From ad1ecc6d8531f4824bb96095c6abeb785c2683e0 Mon Sep 17 00:00:00 2001
From: formath
Date: Tue, 24 Apr 2018 14:41:40 +0800
Subject: [PATCH 0001/3053] op not register

---
 tensorflow/contrib/makefile/Makefile | 7 +++++++
 tensorflow/contrib/makefile/download_dependencies.sh | 4 ++++
 tensorflow/contrib/makefile/tf_op_files.txt | 4 +++-
 3 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile
index 05e8d9064be..bc68316fb32 100644
--- a/tensorflow/contrib/makefile/Makefile
+++ b/tensorflow/contrib/makefile/Makefile
@@ -89,6 +89,8 @@ HOST_INCLUDES := \
 -I$(MAKEFILE_DIR)/downloads/gemmlowp \
 -I$(MAKEFILE_DIR)/downloads/nsync/public \
 -I$(MAKEFILE_DIR)/downloads/fft2d \
+-I$(MAKEFILE_DIR)/downloads/farmhash/src \
+-I$(MAKEFILE_DIR)/downloads/highwayhash \
 -I$(HOST_GENDIR)
 ifeq ($(HAS_GEN_HOST_PROTOC),true)
 	HOST_INCLUDES += -I$(MAKEFILE_DIR)/gen/protobuf-host/include
@@ -171,6 +173,8 @@ INCLUDES := \
 -I$(MAKEFILE_DIR)/downloads/gemmlowp \
 -I$(MAKEFILE_DIR)/downloads/nsync/public \
 -I$(MAKEFILE_DIR)/downloads/fft2d \
+-I$(MAKEFILE_DIR)/downloads/farmhash/src \
+-I$(MAKEFILE_DIR)/downloads/highwayhash \
 -I$(PROTOGENDIR) \
 -I$(PBTGENDIR)
 ifeq ($(HAS_GEN_HOST_PROTOC),true)
@@ -326,6 +330,8 @@ $(MARCH_OPTION) \
 -I$(MAKEFILE_DIR)/downloads/gemmlowp \
 -I$(MAKEFILE_DIR)/downloads/nsync/public \
 -I$(MAKEFILE_DIR)/downloads/fft2d \
+-I$(MAKEFILE_DIR)/downloads/farmhash/src \
+-I$(MAKEFILE_DIR)/downloads/highwayhash \
 -I$(MAKEFILE_DIR)/gen/protobuf_android/$(ANDROID_ARCH)/include \
 -I$(PROTOGENDIR) \
 -I$(PBTGENDIR)
@@ -677,6 +683,7 @@ endif # TEGRA
 TF_CC_SRCS := $(filter-out $(CORE_CC_EXCLUDE_SRCS), $(CORE_CC_ALL_SRCS))
 # Add in any extra files that don't fit the patterns easily
 TF_CC_SRCS += tensorflow/contrib/makefile/downloads/fft2d/fftsg.c
+TF_CC_SRCS += tensorflow/contrib/makefile/downloads/farmhash/src/farmhash.cc
 TF_CC_SRCS += tensorflow/core/common_runtime/gpu/gpu_id_manager.cc
 # Also include the op and kernel definitions.
 TF_CC_SRCS += $(shell cat $(MAKEFILE_DIR)/tf_op_files.txt)
diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh
index 4d3de36e2a4..5ebbd97c821 100755
--- a/tensorflow/contrib/makefile/download_dependencies.sh
+++ b/tensorflow/contrib/makefile/download_dependencies.sh
@@ -37,6 +37,8 @@ RE2_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/re2/.*tar\.gz'
 FFT2D_URL="$(grep -o 'http.*fft\.tgz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
 ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)"
 CUB_URL="$(grep -o 'https.*cub/archive.*zip' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
+FARMHASH_URL="https://mirror.bazel.build/github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz"
+HIGHWAYHASH_URL="https://mirror.bazel.build/github.com/google/highwayhash/archive/dfcb97ca4fe9277bf9dc1802dd979b071896453b.tar.gz"

 # TODO(petewarden): Some new code in Eigen triggers a clang bug with iOS arm64,
 # so work around it by patching the source.
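For context, the kernel sources this patch wires into the makefile build (farmhash.cc above, plus the lookup, as_string and string_to_hash_bucket kernels added to tf_op_files.txt further down) back string-hashing ops that a makefile-built binary otherwise rejects with an "Op type not registered" error at graph-load time. A minimal TF 1.x sketch of such a graph follows; the specific ops shown are an illustrative assumption, since the commit does not name the graph that failed:

```python
import tensorflow as tf

# Ops whose kernels live in as_string_op.cc and string_to_hash_bucket_op.cc;
# StringToHashBucketFast also needs the farmhash sources added above.
ids = tf.constant([123, 456, 789], dtype=tf.int64)
names = tf.as_string(ids)
buckets = tf.string_to_hash_bucket_fast(names, num_buckets=1000)

with tf.Session() as sess:
    print(sess.run(buckets))  # three bucket ids in [0, 1000)
```
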
@@ -91,6 +93,8 @@ download_and_extract "${RE2_URL}" "${DOWNLOADS_DIR}/re2" download_and_extract "${FFT2D_URL}" "${DOWNLOADS_DIR}/fft2d" download_and_extract "${ABSL_URL}" "${DOWNLOADS_DIR}/absl" download_and_extract "${CUB_URL}" "${DOWNLOADS_DIR}/cub/external/cub_archive" +download_and_extract "${FARMHASH_URL}" "${DOWNLOADS_DIR}/farmhash" +download_and_extract "${HIGHWAYHASH_URL}" "${DOWNLOADS_DIR}/highwayhash" replace_by_sed 's#static uint32x4_t p4ui_CONJ_XOR = vld1q_u32( conj_XOR_DATA );#static uint32x4_t p4ui_CONJ_XOR; // = vld1q_u32( conj_XOR_DATA ); - Removed by script#' \ "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h" diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt index d4c3f2eda8b..cd269f60170 100644 --- a/tensorflow/contrib/makefile/tf_op_files.txt +++ b/tensorflow/contrib/makefile/tf_op_files.txt @@ -299,8 +299,10 @@ tensorflow/core/kernels/spacetobatch_functor.cc tensorflow/core/kernels/spacetobatch_op.cc tensorflow/core/kernels/batchtospace_op.cc tensorflow/core/kernels/warn_about_ints.cc -tensorflow/core/kernels/segment_reduction_ops.cc tensorflow/core/kernels/batch_util.cc tensorflow/core/ops/audio_ops.cc tensorflow/core/kernels/decode_proto_op.cc tensorflow/core/kernels/encode_proto_op.cc +tensorflow/core/ops/lookup_ops.cc +tensorflow/core/kernels/as_string_op.cc +tensorflow/core/kernels/string_to_hash_bucket_op.cc From 40e91b23fc426dd6c2025dfd26888488e08d8c7a Mon Sep 17 00:00:00 2001 From: formath Date: Fri, 11 May 2018 16:26:49 +0800 Subject: [PATCH 0002/3053] add op --- tensorflow/contrib/makefile/tf_op_files.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt index cd269f60170..d66e0e804fd 100644 --- a/tensorflow/contrib/makefile/tf_op_files.txt +++ b/tensorflow/contrib/makefile/tf_op_files.txt @@ -306,3 +306,4 @@ tensorflow/core/kernels/encode_proto_op.cc tensorflow/core/ops/lookup_ops.cc tensorflow/core/kernels/as_string_op.cc tensorflow/core/kernels/string_to_hash_bucket_op.cc +tensorflow/core/kernels/snapshot_op.cc From 3ec2f536fa13a091fad6d09bf5287252db7912e5 Mon Sep 17 00:00:00 2001 From: Michael Date: Wed, 12 Sep 2018 10:16:59 +0300 Subject: [PATCH 0003/3053] Add hessian computation for sparse softmax xent. --- .../kernel_tests/sparse_xent_op_test.py | 48 ++++++++++----- tensorflow/python/ops/nn_grad.py | 60 ++++++++++++------- 2 files changed, 70 insertions(+), 38 deletions(-) diff --git a/tensorflow/python/kernel_tests/sparse_xent_op_test.py b/tensorflow/python/kernel_tests/sparse_xent_op_test.py index a841fe83a7f..43ee9a8d587 100644 --- a/tensorflow/python/kernel_tests/sparse_xent_op_test.py +++ b/tensorflow/python/kernel_tests/sparse_xent_op_test.py @@ -188,7 +188,7 @@ class SparseXentTest(test.TestCase): self._testXent(np.zeros((0, 3)), np.zeros((0,), dtype=np.int32)) def testGradient(self): - with self.test_session(use_gpu=True): + with self.test_session(use_gpu=True) as sess: l = constant_op.constant([3, 0, 1], name="l") f = constant_op.constant( [0.1, 0.2, 0.3, 0.4, 0.1, 0.4, 0.9, 1.6, 0.1, 0.8, 2.7, 6.4], @@ -198,25 +198,43 @@ class SparseXentTest(test.TestCase): x = nn_ops.sparse_softmax_cross_entropy_with_logits( labels=l, logits=f, name="xent") err = gradient_checker.compute_gradient_error(f, [3, 4], x, [3]) + + # Check that no extra computation performed. When only first derivative is requested, + # second derivative must not be computed. 
So when there is no second derivative, + # there is no `BatchMatMul` op in the graph. + op_names = [ + op.op_def.name for op in sess.graph.get_operations() if op.op_def + ] + self.assertNotIn("BatchMatMul", op_names) + print("cross entropy gradient err = ", err) self.assertLess(err, 5e-8) def testSecondGradient(self): - images_placeholder = array_ops.placeholder(dtypes.float32, shape=(3, 2)) - labels_placeholder = array_ops.placeholder(dtypes.int32, shape=(3)) - weights = variables.Variable(random_ops.truncated_normal([2], stddev=1.0)) - weights_with_zeros = array_ops.stack([array_ops.zeros([2]), weights], - axis=1) - logits = math_ops.matmul(images_placeholder, weights_with_zeros) - cross_entropy = nn_ops.sparse_softmax_cross_entropy_with_logits( - labels=labels_placeholder, logits=logits) - loss = math_ops.reduce_mean(cross_entropy) + with self.test_session() as sess: + l = constant_op.constant([3, 0, 1], name="l") + f = constant_op.constant( + [0.3, 0.4, 0.1, 1.2, 0.1, 1.9, 0.1, 0.7, 0.8, 0.2, 1.3, 1.3], + shape=[3, 4], + dtype=dtypes.float64, + name="f") + x = nn_ops.sparse_softmax_cross_entropy_with_logits( + labels=l, logits=f, name="xent") - # Taking ths second gradient should fail, since it is not - # yet supported. - with self.assertRaisesRegexp(LookupError, - "explicitly disabled"): - _ = gradients_impl.hessians(loss, [weights]) + gradients = gradients_impl.gradients(x, [f])[0] + err = gradient_checker.compute_gradient_error( + f, [3, 4], gradients, [3, 4]) + + # Check that second derivative is calculated. + # (it is equivalent to being `BatchMatMul` op in the graph because of + # implementation of xentropy grad) + op_names = [ + op.op_def.name for op in sess.graph.get_operations() if op.op_def + ] + self.assertIn("BatchMatMul", op_names) + + print("cross entropy hessian err = ", err) + self.assertLess(err, 5e-8) def _testHighDim(self, features, labels): np_loss, np_backprop = self._npXent(np.array(features), np.array(labels)) diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py index e1a01ab4c32..224230a4810 100644 --- a/tensorflow/python/ops/nn_grad.py +++ b/tensorflow/python/ops/nn_grad.py @@ -444,6 +444,24 @@ def _BroadcastMul(vec, mat): return vec * mat +def _IsZero(tensor): + """Check if tensor contains only zeros. + + Args: + tensor: tensor to check + + Returns: + True if tensor contains only zeros and False otherwise + """ + if context.executing_eagerly(): + # TODO(apassos) add an efficient way to detect eager zeros here. + return False + if tensor.op.type in ("ZerosLike", "Zeros"): + return True + const_fill_value = tensor_util.constant_value(tensor) + return const_fill_value is not None and (const_fill_value == 0).all() + + @ops.RegisterGradient("SoftmaxCrossEntropyWithLogits") def _SoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad): """Gradient function for SoftmaxCrossEntropyWithLogits.""" @@ -455,18 +473,8 @@ def _SoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad): softmax_grad = op.outputs[1] grad = _BroadcastMul(grad_loss, softmax_grad) - def IsZero(g): - # Some introspection to check if the gradient is feeding zeros - if context.executing_eagerly(): - # TODO(apassos) add an efficient way to detect eager zeros here. 
- return False - if g.op.type in ("ZerosLike", "Zeros"): - return True - const_fill_value = tensor_util.constant_value(g) - return const_fill_value is not None and (const_fill_value == 0).all() - logits = op.inputs[0] - if grad_grad is not None and not IsZero(grad_grad): + if grad_grad is not None and not _IsZero(grad_grad): softmax = nn_ops.softmax(logits) grad += ((grad_grad - array_ops.squeeze( @@ -479,22 +487,28 @@ def _SoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad): @ops.RegisterGradient("SparseSoftmaxCrossEntropyWithLogits") -def _SparseSoftmaxCrossEntropyWithLogitsGrad(op, grad_0, _): +def _SparseSoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad): """Gradient function for SparseSoftmaxCrossEntropyWithLogits.""" - # grad_0 is the backprop for cost, and we multiply it with the gradients + # grad_loss is the backprop for cost, and we multiply it with the gradients # (which is output[1]) + # grad_grad is the backprop for softmax gradient. # There is no gradient for the labels # - # Currently there is no way to take the second derivative of this op - # due to the fused implementation's interaction with tf.gradients(), - # so we make sure we prevent silently incorrect results by raising - # an error if the second derivative is requested via prevent_gradient. - sparse_softmax_grad_without_gradient = array_ops.prevent_gradient( - op.outputs[1], - message="Currently there is no way to take the second " - "derivative of sparse_softmax_cross_entropy_with_logits due to the fused " - "implementation's interaction with tf.gradients()") - return _BroadcastMul(grad_0, sparse_softmax_grad_without_gradient), None + # Second derivative is just softmax derivative w.r.t. logits. + softmax_grad = op.outputs[1] + grad = _BroadcastMul(grad_loss, softmax_grad) + + logits = op.inputs[0] + if grad_grad is not None and not _IsZero(grad_grad): + softmax = nn_ops.softmax(logits) + + grad += ((grad_grad - array_ops.squeeze( + math_ops.matmul(array_ops.expand_dims(grad_grad, 1), + array_ops.expand_dims(softmax, 2)), + axis=1)) * + softmax) + + return grad, None @ops.RegisterGradient("Conv2D") From d614ab369b6c59433fa4750b38592f48ae1b5a45 Mon Sep 17 00:00:00 2001 From: Michael Date: Sat, 29 Sep 2018 15:04:38 +0300 Subject: [PATCH 0004/3053] Remove unused imports. --- tensorflow/python/kernel_tests/sparse_xent_op_test.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tensorflow/python/kernel_tests/sparse_xent_op_test.py b/tensorflow/python/kernel_tests/sparse_xent_op_test.py index 43ee9a8d587..230c477079d 100644 --- a/tensorflow/python/kernel_tests/sparse_xent_op_test.py +++ b/tensorflow/python/kernel_tests/sparse_xent_op_test.py @@ -35,9 +35,7 @@ from tensorflow.python.ops import gradient_checker from tensorflow.python.ops import gradients_impl from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops -from tensorflow.python.ops import random_ops from tensorflow.python.ops import sparse_ops -from tensorflow.python.ops import variables import tensorflow.python.ops.nn_grad # pylint: disable=unused-import from tensorflow.python.platform import app from tensorflow.python.platform import test From 0331117652b2f0c7b6aa010501644c21348ca66c Mon Sep 17 00:00:00 2001 From: Michael Date: Sun, 30 Sep 2018 18:47:32 +0300 Subject: [PATCH 0005/3053] Fix sparse softmax xent grad in eager mode. 
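With p = softmax(logits) and v the incoming grad_grad, the second-order term that patches 0003-0005 add to these gradient functions is the Hessian-vector product diag(p)*v - p*(p . v), which is exactly what the expand_dims/matmul/squeeze expression computes row by row (and why the tests only expect a BatchMatMul op in the graph when a second derivative is actually requested). A small NumPy sketch, separate from the patch and using made-up values, that checks this closed form against the explicit Hessian:

```python
import numpy as np

logits = np.array([0.3, 0.4, 0.1, 1.2])  # one row of logits (illustrative values)
v = np.array([1.0, -2.0, 0.5, 0.0])      # incoming grad_grad for that row

p = np.exp(logits - logits.max())
p /= p.sum()                             # p = softmax(logits)

# Closed form used by the gradient functions: p * v - (p . v) * p
hvp_closed_form = p * (v - np.dot(p, v))

# Explicit Hessian of softmax cross-entropy w.r.t. logits: diag(p) - p p^T
hvp_explicit = (np.diag(p) - np.outer(p, p)).dot(v)

assert np.allclose(hvp_closed_form, hvp_explicit)
```
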
--- tensorflow/python/eager/pywrap_tfe_src.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc index c6a55949ab5..c041805e4c3 100644 --- a/tensorflow/python/eager/pywrap_tfe_src.cc +++ b/tensorflow/python/eager/pywrap_tfe_src.cc @@ -1857,7 +1857,6 @@ bool OpGradientDoesntRequireInputIndices( {"Relu6", {true, {}}}, {"Elu", {true, {}}}, {"Selu", {true, {}}}, - {"SparseSoftmaxCrossEntropyWithLogits", {true, {}}}, {"Neg", {true, {}}}, {"Inv", {true, {}}}, {"Reciprocal", {true, {}}}, @@ -1875,6 +1874,7 @@ bool OpGradientDoesntRequireInputIndices( // Ops that don't require a subset of inputs. {"FusedBatchNorm", {false, {2}}}, + {"SparseSoftmaxCrossEntropyWithLogits", {false, {1}}}, }); auto it = m->find(op_name); From 632d0596ec1c8e264c25310a829e2b7d41062abb Mon Sep 17 00:00:00 2001 From: frreiss Date: Fri, 19 Oct 2018 13:11:19 -0700 Subject: [PATCH 0006/3053] Original changes, rolled into a single commit --- .../python/feature_column/feature_column.py | 6 +- .../feature_column/feature_column_test.py | 26 +- .../feature_column/feature_column_v2.py | 6 +- .../feature_column/feature_column_v2_test.py | 24 +- .../python/kernel_tests/check_ops_test.py | 71 +- tensorflow/python/ops/check_ops.py | 701 +++++++----------- 6 files changed, 390 insertions(+), 444 deletions(-) diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py index b1f47ebec2a..7cf0d2048d2 100644 --- a/tensorflow/python/feature_column/feature_column.py +++ b/tensorflow/python/feature_column/feature_column.py @@ -2330,7 +2330,7 @@ class _LazyBuilder(object): if rank is not None: if rank == 0: raise ValueError( - 'Feature (key: {}) cannot have rank 0. Give: {}'.format( + 'Feature (key: {}) cannot have rank 0. Given: {}'.format( key, feature_tensor)) return feature_tensor if rank != 1 else expand_dims(feature_tensor) @@ -3103,9 +3103,13 @@ class _IdentityCategoricalColumn( # Fail if values are out-of-range. 
assert_less = check_ops.assert_less( values, num_buckets, data=(values, num_buckets), + message='Bucket index for categorical column ' + '"{}" exceeds number of buckets'.format(self.name), name='assert_less_than_num_buckets') assert_greater = check_ops.assert_greater_equal( values, zero, data=(values,), + message='Negative bucket index for categorical column "{}"'.format( + self.name), name='assert_greater_or_equal_0') with ops.control_dependencies((assert_less, assert_greater)): values = array_ops.identity(values) diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py index 1ae510250cf..a45ccb58329 100644 --- a/tensorflow/python/feature_column/feature_column_test.py +++ b/tensorflow/python/feature_column/feature_column_test.py @@ -4277,29 +4277,35 @@ class IdentityCategoricalColumnTest(test.TestCase): def test_get_sparse_tensors_with_inputs_too_small(self): column = fc.categorical_column_with_identity(key='aaa', num_buckets=3) - inputs = sparse_tensor.SparseTensorValue( + inputs_value = sparse_tensor.SparseTensorValue( indices=((0, 0), (1, 0), (1, 1)), values=(1, -1, 0), dense_shape=(2, 2)) - id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs})) + inputs_placeholder = array_ops.sparse_placeholder(dtypes.int32) + id_weight_pair = column._get_sparse_tensors( + _LazyBuilder({'aaa': inputs_placeholder})) self.assertIsNone(id_weight_pair.weight_tensor) - with _initialized_session(): + with _initialized_session() as sess: with self.assertRaisesRegexp( - errors.OpError, 'assert_greater_or_equal_0'): - id_weight_pair.id_tensor.eval() + errors.OpError, 'Negative bucket index'): + sess.run(id_weight_pair.id_tensor, + feed_dict={inputs_placeholder: inputs_value}) def test_get_sparse_tensors_with_inputs_too_big(self): column = fc.categorical_column_with_identity(key='aaa', num_buckets=3) - inputs = sparse_tensor.SparseTensorValue( + inputs_value = sparse_tensor.SparseTensorValue( indices=((0, 0), (1, 0), (1, 1)), values=(1, 99, 0), dense_shape=(2, 2)) - id_weight_pair = column._get_sparse_tensors(_LazyBuilder({'aaa': inputs})) + inputs_placeholder = array_ops.sparse_placeholder(dtypes.int32) + id_weight_pair = column._get_sparse_tensors( + _LazyBuilder({'aaa': inputs_placeholder})) self.assertIsNone(id_weight_pair.weight_tensor) - with _initialized_session(): + with _initialized_session() as sess: with self.assertRaisesRegexp( - errors.OpError, 'assert_less_than_num_buckets'): - id_weight_pair.id_tensor.eval() + errors.OpError, 'exceeds number of buckets'): + sess.run(id_weight_pair.id_tensor, + feed_dict={inputs_placeholder: inputs_value}) def test_get_sparse_tensors_with_default_value(self): column = fc.categorical_column_with_identity( diff --git a/tensorflow/python/feature_column/feature_column_v2.py b/tensorflow/python/feature_column/feature_column_v2.py index aeb666cf6a5..875f43e4529 100644 --- a/tensorflow/python/feature_column/feature_column_v2.py +++ b/tensorflow/python/feature_column/feature_column_v2.py @@ -2203,7 +2203,7 @@ class FeatureTransformationCache(object): if rank is not None: if rank == 0: raise ValueError( - 'Feature (key: {}) cannot have rank 0. Give: {}'.format( + 'Feature (key: {}) cannot have rank 0. Given: {}'.format( key, feature_tensor)) return feature_tensor if rank != 1 else expand_dims(feature_tensor) @@ -3387,9 +3387,13 @@ class IdentityCategoricalColumn( # Fail if values are out-of-range. 
assert_less = check_ops.assert_less( values, num_buckets, data=(values, num_buckets), + message='Bucket index for categorical column ' + '"{}" exceeds number of buckets'.format(self.name), name='assert_less_than_num_buckets') assert_greater = check_ops.assert_greater_equal( values, zero, data=(values,), + message='Negative bucket index for categorical column "{}"'.format( + self.name), name='assert_greater_or_equal_0') with ops.control_dependencies((assert_less, assert_greater)): values = array_ops.identity(values) diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py index 31bc0485ef0..3a5ca62bd88 100644 --- a/tensorflow/python/feature_column/feature_column_v2_test.py +++ b/tensorflow/python/feature_column/feature_column_v2_test.py @@ -5151,35 +5151,39 @@ class IdentityCategoricalColumnTest(test.TestCase): def test_get_sparse_tensors_with_inputs_too_small(self): column = fc.categorical_column_with_identity(key='aaa', num_buckets=3) - inputs = sparse_tensor.SparseTensorValue( + inputs_value = sparse_tensor.SparseTensorValue( indices=((0, 0), (1, 0), (1, 1)), values=(1, -1, 0), dense_shape=(2, 2)) + inputs_placeholder = array_ops.sparse_placeholder(dtypes.int32) id_weight_pair = column.get_sparse_tensors( fc.FeatureTransformationCache({ - 'aaa': inputs + 'aaa': inputs_placeholder }), None) self.assertIsNone(id_weight_pair.weight_tensor) - with _initialized_session(): + with _initialized_session() as sess: with self.assertRaisesRegexp( - errors.OpError, 'assert_greater_or_equal_0'): - id_weight_pair.id_tensor.eval() + errors.OpError, 'Negative bucket index'): + sess.run(id_weight_pair.id_tensor, + feed_dict={inputs_placeholder: inputs_value}) def test_get_sparse_tensors_with_inputs_too_big(self): column = fc.categorical_column_with_identity(key='aaa', num_buckets=3) - inputs = sparse_tensor.SparseTensorValue( + inputs_value = sparse_tensor.SparseTensorValue( indices=((0, 0), (1, 0), (1, 1)), values=(1, 99, 0), dense_shape=(2, 2)) + inputs_placeholder = array_ops.sparse_placeholder(dtypes.int32) id_weight_pair = column.get_sparse_tensors( fc.FeatureTransformationCache({ - 'aaa': inputs + 'aaa': inputs_placeholder }), None) self.assertIsNone(id_weight_pair.weight_tensor) - with _initialized_session(): + with _initialized_session() as sess: with self.assertRaisesRegexp( - errors.OpError, 'assert_less_than_num_buckets'): - id_weight_pair.id_tensor.eval() + errors.OpError, 'exceeds number of buckets'): + sess.run(id_weight_pair.id_tensor, + feed_dict={inputs_placeholder: inputs_value}) def test_get_sparse_tensors_with_default_value(self): column = fc.categorical_column_with_identity( diff --git a/tensorflow/python/kernel_tests/check_ops_test.py b/tensorflow/python/kernel_tests/check_ops_test.py index 88f5cd6f223..90514e3976b 100644 --- a/tensorflow/python/kernel_tests/check_ops_test.py +++ b/tensorflow/python/kernel_tests/check_ops_test.py @@ -140,8 +140,7 @@ Corresponding y values: First 6 elements of x: \[2 2 3 3 6 6\] First 6 elements of y: -\[20 2 3 30 60 6\] -""" +\[20 2 3 30 60 6\]""" expected_error_msg_default = r"""big does not equal small Condition x == y did not hold. Indices of first 3 different values: @@ -155,8 +154,7 @@ Corresponding y values: First 3 elements of x: \[2 2 3\] First 3 elements of y: -\[20 2 3\] -""" +\[20 2 3\]""" expected_error_msg_short = r"""big does not equal small Condition x == y did not hold. 
Indices of first 2 different values: @@ -169,8 +167,7 @@ Corresponding y values: First 2 elements of x: \[2 2\] First 2 elements of y: -\[20 2\] -""" +\[20 2\]""" with context.eager_mode(): big = constant_op.constant([[2, 2], [3, 3], [6, 6]]) small = constant_op.constant([[20, 2], [3, 30], [60, 6]]) @@ -302,11 +299,17 @@ class AssertNoneEqualTest(test.TestCase): x = check_ops.assert_none_equal(t1, t2) assert x is None + def test_static_check_in_graph_mode(self): + with context.graph_mode(): + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "Custom error message"): + check_ops.assert_none_equal(1, 1, message="Custom error message") + def test_error_message_eager(self): # Note that the following three strings are regexes - expected_error_msg_full = r"""0.0, 1.0, 2.0, 3.0, 4.0, 5.0""" - expected_error_msg_default = r"""0.0, 1.0, 2.0, \.\.\.""" - expected_error_msg_short = r"""0.0, 1.0, \.\.\.""" + expected_error_msg_full = r"""\[0\. 1\. 2\. 3\. 4\. 5\.\]""" + expected_error_msg_default = r"""\[0\. 1\. 2\.\]""" + expected_error_msg_short = r"""\[0\. 1\.\]""" with context.eager_mode(): t = constant_op.constant( np.array(range(6)), shape=[2, 3], dtype=np.float32) @@ -506,6 +509,12 @@ class AssertLessTest(test.TestCase): x = check_ops.assert_less(t1, t2) assert x is None + def test_static_check_in_graph_mode(self): + with context.graph_mode(): + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "Custom error message"): + check_ops.assert_none_equal(1, 1, message="Custom error message") + class AssertLessEqualTest(test.TestCase): @@ -569,6 +578,12 @@ class AssertLessEqualTest(test.TestCase): out = array_ops.identity(larry) self.evaluate(out) + def test_static_check_in_graph_mode(self): + with context.graph_mode(): + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "Custom error message"): + check_ops.assert_less_equal(1, 0, message="Custom error message") + class AssertGreaterTest(test.TestCase): @@ -630,6 +645,12 @@ class AssertGreaterTest(test.TestCase): out = array_ops.identity(larry) self.evaluate(out) + def test_static_check_in_graph_mode(self): + with context.graph_mode(): + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "Custom error message"): + check_ops.assert_greater(0, 1, message="Custom error message") + class AssertGreaterEqualTest(test.TestCase): @@ -695,6 +716,12 @@ class AssertGreaterEqualTest(test.TestCase): out = array_ops.identity(larry) self.evaluate(out) + def test_static_check_in_graph_mode(self): + with context.graph_mode(): + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "Custom error message"): + check_ops.assert_greater_equal(0, 1, message="Custom error message") + class AssertNegativeTest(test.TestCase): @@ -734,6 +761,12 @@ class AssertNegativeTest(test.TestCase): out = array_ops.identity(empty) self.evaluate(out) + def test_static_check_in_graph_mode(self): + with context.graph_mode(): + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "Custom error message"): + check_ops.assert_negative(1, message="Custom error message") + class AssertPositiveTest(test.TestCase): @@ -773,6 +806,12 @@ class AssertPositiveTest(test.TestCase): out = array_ops.identity(empty) self.evaluate(out) + def test_static_check_in_graph_mode(self): + with context.graph_mode(): + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "Custom error message"): + check_ops.assert_positive(-1, message="Custom error message") + class EnsureShapeTest(test.TestCase): @@ -1281,6 +1320,13 @@ class 
AssertNonNegativeTest(test.TestCase): out = array_ops.identity(empty) self.evaluate(out) + def test_static_check_in_graph_mode(self): + with context.graph_mode(): + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "Custom error message"): + check_ops.assert_non_negative(-1, message="Custom error message") + + class AssertNonPositiveTest(test.TestCase): @@ -1310,6 +1356,13 @@ class AssertNonPositiveTest(test.TestCase): out = array_ops.identity(empty) self.evaluate(out) + def test_static_check_in_graph_mode(self): + with context.graph_mode(): + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "Custom error message"): + check_ops.assert_non_positive(1, message="Custom error message") + + class AssertIntegerTest(test.TestCase): diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py index 40b111ea0c2..d5bb01e604f 100644 --- a/tensorflow/python/ops/check_ops.py +++ b/tensorflow/python/ops/check_ops.py @@ -91,6 +91,260 @@ def _shape_and_dtype_str(tensor): """Returns a string containing tensor's shape and dtype.""" return 'shape=%s dtype=%s' % (tensor.shape, tensor.dtype.name) +def _unary_assert_doc(sym, sym_name): + """ + Common docstring for assert_* ops that evaluate a unary predicate over every + element of a tensor. + + Args: + sym: Mathematical symbol for the check performed on each element, i.e. + "> 0" + sym_name: English-language name for the op described by sym + """ + def _decorator(func): + opname = func.__name__ + cap_sym_name = sym_name.capitalize() + + func.__doc__ = """ + Assert the condition `x {sym}` holds element-wise. + + When running in graph mode, you should add a dependency on this operation + to ensure that it runs. Example of adding a dependency to an operation: + + ```python + with tf.control_dependencies([tf.debugging.{opname}(x, y)]): + output = tf.reduce_sum(x) + ``` + + {sym_name} means, for every element `x[i]` of `x`, we have `x[i] {sym}`. + If `x` is empty this is trivially satisfied. + + Args: + x: Numeric `Tensor`. + data: The tensors to print out if the condition is False. Defaults to + error message and first few entries of `x`. + summarize: Print this many entries of each tensor. + message: A string to prefix to the default message. + name: A name for this operation (optional). Defaults to "{opname}". + + Returns: + Op that raises `InvalidArgumentError` if `x {sym}` is False. + @compatibility{{eager}} + returns None + @end_compatibility + + Raises: + InvalidArgumentError: if the check can be performed immediately and + `x {sym}` is False. The check can be performed immediately during + eager execution or if `x` is statically known. + """.format(sym=sym, sym_name=cap_sym_name, opname=opname) + return func + + return _decorator + + +def _binary_assert_doc(sym): + """ + Common docstring for most of the assert_* ops that compare two tensors + element-wise. + + Args: + sym: Binary operation symbol, i.e. "==" + """ + def _decorator(func): + opname = func.__name__ + + func.__doc__ = """ + Assert the condition `x {sym} y` holds element-wise. + + This condition holds if for every pair of (possibly broadcast) elements + `x[i]`, `y[i]`, we have `x[i] {sym} y[i]`. + If both `x` and `y` are empty, this is trivially satisfied. + + When running in graph mode, you should add a dependency on this operation + to ensure that it runs. Example of adding a dependency to an operation: + + ```python + with tf.control_dependencies([tf.debugging.{opname}(x, y)]): + output = tf.reduce_sum(x) + ``` + + Args: + x: Numeric `Tensor`. 
+ y: Numeric `Tensor`, same dtype as and broadcastable to `x`. + data: The tensors to print out if the condition is False. Defaults to + error message and first few entries of `x`, `y`. + summarize: Print this many entries of each tensor. + message: A string to prefix to the default message. + name: A name for this operation (optional). Defaults to "{opname}". + + Returns: + Op that raises `InvalidArgumentError` if `x {sym} y` is False. + @compatibility{{eager}} + returns None + @end_compatibility + + Raises: + InvalidArgumentError: if the check can be performed immediately and + `x {sym} y` is False. The check can be performed immediately during + eager execution or if `x` and `y` are statically known. + """.format(sym=sym, opname=opname) + return func + + return _decorator + + +def _make_assert_msg_data(sym, x, y, summarize, test_op): + """ + Subroutine of _binary_assert that generates the components of the default + error message when running in eager mode. + + Args: + sym: Mathematical symbol for the test to apply to pairs of tensor + elements, i.e. "==" + x, y: Inputs to the assertion after convert_to_tensor() + summarize: Value of the "summarize" parameter to the original assert_* + call; tells how many elements of each tensor to print. + test_op: TensorFlow op that returns a Boolean tensor with True in each + position where the assertion is satisfied. + + Returns: + List of tensors and scalars that, when stringified and concatenated, + will produce the error message string. + """ + # Prepare a message with first elements of x and y. + data = [] + + data.append('Condition x %s y did not hold.' % sym) + + if summarize > 0: + if x.shape == y.shape and x.shape.as_list(): + # If the shapes of x and y are the same (and not scalars), + # Get the values that actually differed and their indices. + # If shapes are different this information is more confusing + # than useful. + mask = math_ops.logical_not(test_op) + indices = array_ops.where(mask) + indices_np = indices.numpy() + x_vals = array_ops.boolean_mask(x, mask) + y_vals = array_ops.boolean_mask(y, mask) + num_vals = min(summarize, indices_np.shape[0]) + data.append('Indices of first %d different values:' % num_vals) + data.append(indices_np[:num_vals]) + data.append('Corresponding x values:') + data.append(x_vals.numpy().reshape((-1,))[:num_vals]) + data.append('Corresponding y values:') + data.append(y_vals.numpy().reshape((-1,))[:num_vals]) + + if summarize > 0: + # reshape((-1,)) is the fastest way to get a flat array view. + x_np = x.numpy().reshape((-1,)) + y_np = y.numpy().reshape((-1,)) + x_sum = min(x_np.size, summarize) + y_sum = min(y_np.size, summarize) + data.append('First %d elements of x:' % x_sum) + data.append(x_np[:x_sum]) + data.append('First %d elements of y:' % y_sum) + data.append(y_np[:y_sum]) + + return data + + +def _pretty_print(data_item, summarize): + """ + Format a data item for use in an error message in eager mode. + + Args: + data_item: One of the items in the "data" argument to an assert_* + function. Can be a Tensor or a scalar value. + summarize: How many elements to retain of each tensor-valued entry + in data. 
+ + Returns an appropriate string representation of data_item + """ + if isinstance(data_item, ops.Tensor): + arr = data_item.numpy() + if np.isscalar(arr): + # Tensor.numpy() returns a scalar for zero-dimensional tensors + return str(arr) + else: + flat = arr.reshape((-1,)) + lst = [str(x) for x in flat[:summarize]] + if len(lst) < flat.size: + lst.append("...") + return str(lst) + else: + return str(data_item) + + +def _binary_assert(sym, opname, op_func, static_func, + x, y, data, summarize, message, name): + """ + Generic binary elementwise assertion. Implements the behavior described + in _binary_assert_doc() above. + + Args: + sym: Mathematical symbol for the test to apply to pairs of tensor + elements, i.e. "==" + opname: Name of the assert op in the public API, i.e. "assert_equal" + op_func: Function that, if passed the two Tensor inputs to the + assertion (x and y), will return the test to be passed to reduce_all() + i.e. + static_func: Function that, if passed numpy ndarray versions of the two + inputs to the assertion, will return a Boolean ndarray with containing + True in all positions where the assertion PASSES. + i.e. lambda x,y: (x == y) for assert_equal() + x, y, data, summarize, message, name: See doc in _binary_assert_doc + above. + + Returns: + See doc in _binary_assert_doc(). + """ + with ops.name_scope(name, opname, [x, y, data]): + x = ops.convert_to_tensor(x, name='x') + y = ops.convert_to_tensor(y, name='y') + + if context.executing_eagerly(): + test_op = op_func(x, y) + condition = math_ops.reduce_all(test_op) + if condition: + return + else: + # Default to printing 3 elements like control_flow_ops.Assert (used + # by graph mode) does. Also treat negative values as "print + # everything" for consistency with Tensor::SummarizeValue(). + if summarize is None: + summarize = 3 + elif summarize < 0: + summarize = 1e9 # Code below will find exact size of x and y. + + if data is None: + data = _make_assert_msg_data(sym, x, y, summarize, test_op) + + if message is not None: + data = [message] + list(data) + + raise errors.InvalidArgumentError( + node_def=None, op=None, + message=('\n'.join([_pretty_print(d, summarize) for d in data]))) + + else: # not context.executing_eagerly() + if data is None: + data = [ + 'Condition x %s y did not hold element-wise:' % sym, + 'x (%s) = ' % x.name, x, + 'y (%s) = ' % y.name, y + ] + if message is not None: + data = [message] + list(data) + condition = math_ops.reduce_all(op_func(x, y)) + x_static = tensor_util.constant_value(x) + y_static = tensor_util.constant_value(y) + if x_static is not None and y_static is not None: + condition_static = static_func(x_static, y_static).all() + _assert_static(condition_static, data) + return control_flow_ops.Assert(condition, data, summarize=summarize) + @tf_export( 'debugging.assert_proper_iterable', @@ -127,30 +381,8 @@ def assert_proper_iterable(values): 'debugging.assert_negative', v1=['debugging.assert_negative', 'assert_negative']) @deprecation.deprecated_endpoints('assert_negative') +@_unary_assert_doc('< 0', 'negative') def assert_negative(x, data=None, summarize=None, message=None, name=None): - """Assert the condition `x < 0` holds element-wise. - - Example of adding a dependency to an operation: - - ```python - with tf.control_dependencies([tf.assert_negative(x)]): - output = tf.reduce_sum(x) - ``` - - Negative means, for every element `x[i]` of `x`, we have `x[i] < 0`. - If `x` is empty this is trivially satisfied. - - Args: - x: Numeric `Tensor`. 
- data: The tensors to print out if the condition is False. Defaults to - error message and first few entries of `x`. - summarize: Print this many entries of each tensor. - message: A string to prefix to the default message. - name: A name for this operation (optional). Defaults to "assert_negative". - - Returns: - Op raising `InvalidArgumentError` unless `x` is all negative. - """ message = message or '' with ops.name_scope(name, 'assert_negative', [x, data]): x = ops.convert_to_tensor(x, name='x') @@ -171,30 +403,8 @@ def assert_negative(x, data=None, summarize=None, message=None, name=None): 'debugging.assert_positive', v1=['debugging.assert_positive', 'assert_positive']) @deprecation.deprecated_endpoints('assert_positive') +@_unary_assert_doc('> 0', 'positive') def assert_positive(x, data=None, summarize=None, message=None, name=None): - """Assert the condition `x > 0` holds element-wise. - - Example of adding a dependency to an operation: - - ```python - with tf.control_dependencies([tf.assert_positive(x)]): - output = tf.reduce_sum(x) - ``` - - Positive means, for every element `x[i]` of `x`, we have `x[i] > 0`. - If `x` is empty this is trivially satisfied. - - Args: - x: Numeric `Tensor`. - data: The tensors to print out if the condition is False. Defaults to - error message and first few entries of `x`. - summarize: Print this many entries of each tensor. - message: A string to prefix to the default message. - name: A name for this operation (optional). Defaults to "assert_positive". - - Returns: - Op raising `InvalidArgumentError` unless `x` is all positive. - """ message = message or '' with ops.name_scope(name, 'assert_positive', [x, data]): x = ops.convert_to_tensor(x, name='x') @@ -214,31 +424,8 @@ def assert_positive(x, data=None, summarize=None, message=None, name=None): 'debugging.assert_non_negative', v1=['debugging.assert_non_negative', 'assert_non_negative']) @deprecation.deprecated_endpoints('assert_non_negative') +@_unary_assert_doc('>= 0', 'non-negative') def assert_non_negative(x, data=None, summarize=None, message=None, name=None): - """Assert the condition `x >= 0` holds element-wise. - - Example of adding a dependency to an operation: - - ```python - with tf.control_dependencies([tf.assert_non_negative(x)]): - output = tf.reduce_sum(x) - ``` - - Non-negative means, for every element `x[i]` of `x`, we have `x[i] >= 0`. - If `x` is empty this is trivially satisfied. - - Args: - x: Numeric `Tensor`. - data: The tensors to print out if the condition is False. Defaults to - error message and first few entries of `x`. - summarize: Print this many entries of each tensor. - message: A string to prefix to the default message. - name: A name for this operation (optional). - Defaults to "assert_non_negative". - - Returns: - Op raising `InvalidArgumentError` unless `x` is all non-negative. - """ message = message or '' with ops.name_scope(name, 'assert_non_negative', [x, data]): x = ops.convert_to_tensor(x, name='x') @@ -259,31 +446,8 @@ def assert_non_negative(x, data=None, summarize=None, message=None, name=None): 'debugging.assert_non_positive', v1=['debugging.assert_non_positive', 'assert_non_positive']) @deprecation.deprecated_endpoints('assert_non_positive') +@_unary_assert_doc('<= 0', 'non-positive') def assert_non_positive(x, data=None, summarize=None, message=None, name=None): - """Assert the condition `x <= 0` holds element-wise. 
- - Example of adding a dependency to an operation: - - ```python - with tf.control_dependencies([tf.assert_non_positive(x)]): - output = tf.reduce_sum(x) - ``` - - Non-positive means, for every element `x[i]` of `x`, we have `x[i] <= 0`. - If `x` is empty this is trivially satisfied. - - Args: - x: Numeric `Tensor`. - data: The tensors to print out if the condition is False. Defaults to - error message and first few entries of `x`. - summarize: Print this many entries of each tensor. - message: A string to prefix to the default message. - name: A name for this operation (optional). - Defaults to "assert_non_positive". - - Returns: - Op raising `InvalidArgumentError` unless `x` is all non-positive. - """ message = message or '' with ops.name_scope(name, 'assert_non_positive', [x, data]): x = ops.convert_to_tensor(x, name='x') @@ -301,157 +465,25 @@ def assert_non_positive(x, data=None, summarize=None, message=None, name=None): @tf_export('debugging.assert_equal', 'assert_equal') +@_binary_assert_doc('==') def assert_equal(x, y, data=None, summarize=None, message=None, name=None): - """Assert the condition `x == y` holds element-wise. - - Example of adding a dependency to an operation: - - ```python - with tf.control_dependencies([tf.assert_equal(x, y)]): - output = tf.reduce_sum(x) - ``` - - This condition holds if for every pair of (possibly broadcast) elements - `x[i]`, `y[i]`, we have `x[i] == y[i]`. - If both `x` and `y` are empty, this is trivially satisfied. - - Args: - x: Numeric `Tensor`. - y: Numeric `Tensor`, same dtype as and broadcastable to `x`. - data: The tensors to print out if the condition is False. Defaults to - error message and first few entries of `x`, `y`. - summarize: Print this many entries of each tensor. - message: A string to prefix to the default message. - name: A name for this operation (optional). Defaults to "assert_equal". - - Returns: - Op that raises `InvalidArgumentError` if `x == y` is False. - @compatibility{eager} returns None - - Raises: - InvalidArgumentError: if the check can be performed immediately and - `x == y` is False. The check can be performed immediately during eager - execution or if `x` and `y` are statically known. - """ - message = message or '' - with ops.name_scope(name, 'assert_equal', [x, y, data]): - x = ops.convert_to_tensor(x, name='x') - y = ops.convert_to_tensor(y, name='y') - - if context.executing_eagerly(): - eq = math_ops.equal(x, y) - condition = math_ops.reduce_all(eq) - if not condition: - # Prepare a message with first elements of x and y. - summary_msg = '' - # Default to printing 3 elements like control_flow_ops.Assert (used - # by graph mode) does. - summarize = 3 if summarize is None else summarize - if summarize: - # reshape((-1,)) is the fastest way to get a flat array view. - x_np = x.numpy().reshape((-1,)) - y_np = y.numpy().reshape((-1,)) - x_sum = min(x_np.size, summarize) - y_sum = min(y_np.size, summarize) - summary_msg = ('First %d elements of x:\n%s\n' - 'First %d elements of y:\n%s\n' % - (x_sum, x_np[:x_sum], - y_sum, y_np[:y_sum])) - - index_and_values_str = '' - if x.shape == y.shape and x.shape.as_list(): - # If the shapes of x and y are the same (and not scalars), - # Get the values that actually differed and their indices. - # If shapes are different this information is more confusing - # than useful. 
- mask = math_ops.logical_not(eq) - indices = array_ops.where(mask) - indices_np = indices.numpy() - x_vals = array_ops.boolean_mask(x, mask) - y_vals = array_ops.boolean_mask(y, mask) - summarize = min(summarize, indices_np.shape[0]) - index_and_values_str = ( - 'Indices of first %s different values:\n%s\n' - 'Corresponding x values:\n%s\n' - 'Corresponding y values:\n%s\n' % - (summarize, indices_np[:summarize], - x_vals.numpy().reshape((-1,))[:summarize], - y_vals.numpy().reshape((-1,))[:summarize])) - - raise errors.InvalidArgumentError( - node_def=None, op=None, - message=('%s\nCondition x == y did not hold.\n%s%s' % - (message or '', index_and_values_str, summary_msg))) - return - - if data is None: - data = [ - message, - 'Condition x == y did not hold element-wise:', - 'x (%s) = ' % x.name, x, - 'y (%s) = ' % y.name, y - ] - condition = math_ops.reduce_all(math_ops.equal(x, y)) - x_static = tensor_util.constant_value(x) - y_static = tensor_util.constant_value(y) - if x_static is not None and y_static is not None: - condition_static = (x_static == y_static).all() - _assert_static(condition_static, data) - return control_flow_ops.Assert(condition, data, summarize=summarize) + return _binary_assert('==', 'assert_equal', + math_ops.equal, + lambda x, y: (x == y), + x, y, data, summarize, message, name) @tf_export( 'debugging.assert_none_equal', v1=['debugging.assert_none_equal', 'assert_none_equal']) @deprecation.deprecated_endpoints('assert_none_equal') +@_binary_assert_doc('!=') def assert_none_equal( x, y, data=None, summarize=None, message=None, name=None): - """Assert the condition `x != y` holds for all elements. - - Example of adding a dependency to an operation: - - ```python - with tf.control_dependencies([tf.assert_none_equal(x, y)]): - output = tf.reduce_sum(x) - ``` - - This condition holds if for every pair of (possibly broadcast) elements - `x[i]`, `y[i]`, we have `x[i] != y[i]`. - If both `x` and `y` are empty, this is trivially satisfied. - - Args: - x: Numeric `Tensor`. - y: Numeric `Tensor`, same dtype as and broadcastable to `x`. - data: The tensors to print out if the condition is False. Defaults to - error message and first few entries of `x`, `y`. - summarize: Print this many entries of each tensor. - message: A string to prefix to the default message. - name: A name for this operation (optional). - Defaults to "assert_none_equal". - - Returns: - Op that raises `InvalidArgumentError` if `x != y` is ever False. 
- """ - message = message or '' - with ops.name_scope(name, 'assert_none_equal', [x, y, data]): - x = ops.convert_to_tensor(x, name='x') - y = ops.convert_to_tensor(y, name='y') - if context.executing_eagerly(): - x_name = _shape_and_dtype_str(x) - y_name = _shape_and_dtype_str(y) - else: - x_name = x.name - y_name = y.name - - if data is None: - data = [ - message, - 'Condition x != y did not hold for every single element:', - 'x (%s) = ' % x_name, x, - 'y (%s) = ' % y_name, y - ] - condition = math_ops.reduce_all(math_ops.not_equal(x, y)) - return control_flow_ops.Assert(condition, data, summarize=summarize) + return _binary_assert('!=', 'assert_none_equal', + math_ops.not_equal, + lambda x, y: (x != y), + x, y, data, summarize, message, name) @tf_export('debugging.assert_near', v1=['debugging.assert_near', 'assert_near']) @@ -534,203 +566,46 @@ def assert_near( @tf_export('debugging.assert_less', 'assert_less') +@_binary_assert_doc('<') def assert_less(x, y, data=None, summarize=None, message=None, name=None): - """Assert the condition `x < y` holds element-wise. - - Example of adding a dependency to an operation: - - ```python - with tf.control_dependencies([tf.assert_less(x, y)]): - output = tf.reduce_sum(x) - ``` - - This condition holds if for every pair of (possibly broadcast) elements - `x[i]`, `y[i]`, we have `x[i] < y[i]`. - If both `x` and `y` are empty, this is trivially satisfied. - - Args: - x: Numeric `Tensor`. - y: Numeric `Tensor`, same dtype as and broadcastable to `x`. - data: The tensors to print out if the condition is False. Defaults to - error message and first few entries of `x`, `y`. - summarize: Print this many entries of each tensor. - message: A string to prefix to the default message. - name: A name for this operation (optional). Defaults to "assert_less". - - Returns: - Op that raises `InvalidArgumentError` if `x < y` is False. - """ - message = message or '' - with ops.name_scope(name, 'assert_less', [x, y, data]): - x = ops.convert_to_tensor(x, name='x') - y = ops.convert_to_tensor(y, name='y') - if context.executing_eagerly(): - x_name = _shape_and_dtype_str(x) - y_name = _shape_and_dtype_str(y) - else: - x_name = x.name - y_name = y.name - - if data is None: - data = [ - message, - 'Condition x < y did not hold element-wise:', - 'x (%s) = ' % x_name, x, 'y (%s) = ' % y_name, y - ] - condition = math_ops.reduce_all(math_ops.less(x, y)) - return control_flow_ops.Assert(condition, data, summarize=summarize) + return _binary_assert('<', 'assert_less', + math_ops.less, + lambda x, y: (x < y), + x, y, data, summarize, message, name) @tf_export( 'debugging.assert_less_equal', v1=['debugging.assert_less_equal', 'assert_less_equal']) @deprecation.deprecated_endpoints('assert_less_equal') +@_binary_assert_doc('<=') def assert_less_equal(x, y, data=None, summarize=None, message=None, name=None): - """Assert the condition `x <= y` holds element-wise. - - Example of adding a dependency to an operation: - - ```python - with tf.control_dependencies([tf.assert_less_equal(x, y)]): - output = tf.reduce_sum(x) - ``` - - This condition holds if for every pair of (possibly broadcast) elements - `x[i]`, `y[i]`, we have `x[i] <= y[i]`. - If both `x` and `y` are empty, this is trivially satisfied. - - Args: - x: Numeric `Tensor`. - y: Numeric `Tensor`, same dtype as and broadcastable to `x`. - data: The tensors to print out if the condition is False. Defaults to - error message and first few entries of `x`, `y`. - summarize: Print this many entries of each tensor. 
- message: A string to prefix to the default message. - name: A name for this operation (optional). Defaults to "assert_less_equal" - - Returns: - Op that raises `InvalidArgumentError` if `x <= y` is False. - """ - message = message or '' - with ops.name_scope(name, 'assert_less_equal', [x, y, data]): - x = ops.convert_to_tensor(x, name='x') - y = ops.convert_to_tensor(y, name='y') - if context.executing_eagerly(): - x_name = _shape_and_dtype_str(x) - y_name = _shape_and_dtype_str(y) - else: - x_name = x.name - y_name = y.name - - if data is None: - data = [ - message, - 'Condition x <= y did not hold element-wise:' - 'x (%s) = ' % x_name, x, 'y (%s) = ' % y_name, y - ] - condition = math_ops.reduce_all(math_ops.less_equal(x, y)) - return control_flow_ops.Assert(condition, data, summarize=summarize) + return _binary_assert('<=', 'assert_less_equal', + math_ops.less_equal, + lambda x, y: (x <= y), + x, y, data, summarize, message, name) @tf_export('debugging.assert_greater', 'assert_greater') +@_binary_assert_doc('>') def assert_greater(x, y, data=None, summarize=None, message=None, name=None): - """Assert the condition `x > y` holds element-wise. - - Example of adding a dependency to an operation: - - ```python - with tf.control_dependencies([tf.assert_greater(x, y)]): - output = tf.reduce_sum(x) - ``` - - This condition holds if for every pair of (possibly broadcast) elements - `x[i]`, `y[i]`, we have `x[i] > y[i]`. - If both `x` and `y` are empty, this is trivially satisfied. - - Args: - x: Numeric `Tensor`. - y: Numeric `Tensor`, same dtype as and broadcastable to `x`. - data: The tensors to print out if the condition is False. Defaults to - error message and first few entries of `x`, `y`. - summarize: Print this many entries of each tensor. - message: A string to prefix to the default message. - name: A name for this operation (optional). Defaults to "assert_greater". - - Returns: - Op that raises `InvalidArgumentError` if `x > y` is False. - """ - message = message or '' - with ops.name_scope(name, 'assert_greater', [x, y, data]): - x = ops.convert_to_tensor(x, name='x') - y = ops.convert_to_tensor(y, name='y') - if context.executing_eagerly(): - x_name = _shape_and_dtype_str(x) - y_name = _shape_and_dtype_str(y) - else: - x_name = x.name - y_name = y.name - - if data is None: - data = [ - message, - 'Condition x > y did not hold element-wise:' - 'x (%s) = ' % x_name, x, 'y (%s) = ' % y_name, y - ] - condition = math_ops.reduce_all(math_ops.greater(x, y)) - return control_flow_ops.Assert(condition, data, summarize=summarize) + return _binary_assert('>', 'assert_greater', + math_ops.greater, + lambda x, y: (x > y), + x, y, data, summarize, message, name) @tf_export( 'debugging.assert_greater_equal', v1=['debugging.assert_greater_equal', 'assert_greater_equal']) @deprecation.deprecated_endpoints('assert_greater_equal') +@_binary_assert_doc('>=') def assert_greater_equal(x, y, data=None, summarize=None, message=None, name=None): - """Assert the condition `x >= y` holds element-wise. - - Example of adding a dependency to an operation: - - ```python - with tf.control_dependencies([tf.assert_greater_equal(x, y)]): - output = tf.reduce_sum(x) - ``` - - This condition holds if for every pair of (possibly broadcast) elements - `x[i]`, `y[i]`, we have `x[i] >= y[i]`. - If both `x` and `y` are empty, this is trivially satisfied. - - Args: - x: Numeric `Tensor`. - y: Numeric `Tensor`, same dtype as and broadcastable to `x`. - data: The tensors to print out if the condition is False. 
Defaults to - error message and first few entries of `x`, `y`. - summarize: Print this many entries of each tensor. - message: A string to prefix to the default message. - name: A name for this operation (optional). Defaults to - "assert_greater_equal" - - Returns: - Op that raises `InvalidArgumentError` if `x >= y` is False. - """ - message = message or '' - with ops.name_scope(name, 'assert_greater_equal', [x, y, data]): - x = ops.convert_to_tensor(x, name='x') - y = ops.convert_to_tensor(y, name='y') - if context.executing_eagerly(): - x_name = _shape_and_dtype_str(x) - y_name = _shape_and_dtype_str(y) - else: - x_name = x.name - y_name = y.name - - if data is None: - data = [ - message, - 'Condition x >= y did not hold element-wise:' - 'x (%s) = ' % x_name, x, 'y (%s) = ' % y_name, y - ] - condition = math_ops.reduce_all(math_ops.greater_equal(x, y)) - return control_flow_ops.Assert(condition, data, summarize=summarize) + return _binary_assert('>=', 'assert_greater_equal', + math_ops.greater_equal, + lambda x, y: (x >= y), + x, y, data, summarize, message, name) def _assert_rank_condition( From d64a833ae1f307d904702e6fcbdd2d99db7c4a1c Mon Sep 17 00:00:00 2001 From: frreiss Date: Fri, 19 Oct 2018 13:30:30 -0700 Subject: [PATCH 0007/3053] Address current review comments --- .../python/kernel_tests/check_ops_test.py | 2 +- tensorflow/python/ops/check_ops.py | 35 ++++++++++--------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/tensorflow/python/kernel_tests/check_ops_test.py b/tensorflow/python/kernel_tests/check_ops_test.py index 90514e3976b..197189f9906 100644 --- a/tensorflow/python/kernel_tests/check_ops_test.py +++ b/tensorflow/python/kernel_tests/check_ops_test.py @@ -513,7 +513,7 @@ class AssertLessTest(test.TestCase): with context.graph_mode(): with self.assertRaisesRegexp(errors.InvalidArgumentError, "Custom error message"): - check_ops.assert_none_equal(1, 1, message="Custom error message") + check_ops.assert_less(1, 1, message="Custom error message") class AssertLessEqualTest(test.TestCase): diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py index d5bb01e604f..96f1c3b5854 100644 --- a/tensorflow/python/ops/check_ops.py +++ b/tensorflow/python/ops/check_ops.py @@ -309,24 +309,25 @@ def _binary_assert(sym, opname, op_func, static_func, condition = math_ops.reduce_all(test_op) if condition: return - else: - # Default to printing 3 elements like control_flow_ops.Assert (used - # by graph mode) does. Also treat negative values as "print - # everything" for consistency with Tensor::SummarizeValue(). - if summarize is None: - summarize = 3 - elif summarize < 0: - summarize = 1e9 # Code below will find exact size of x and y. + + # If we get here, the assertion has failed. + # Default to printing 3 elements like control_flow_ops.Assert (used + # by graph mode) does. Also treat negative values as "print + # everything" for consistency with Tensor::SummarizeValue(). + if summarize is None: + summarize = 3 + elif summarize < 0: + summarize = 1e9 # Code below will find exact size of x and y. 
- if data is None: - data = _make_assert_msg_data(sym, x, y, summarize, test_op) - - if message is not None: - data = [message] + list(data) - - raise errors.InvalidArgumentError( - node_def=None, op=None, - message=('\n'.join([_pretty_print(d, summarize) for d in data]))) + if data is None: + data = _make_assert_msg_data(sym, x, y, summarize, test_op) + + if message is not None: + data = [message] + list(data) + + raise errors.InvalidArgumentError( + node_def=None, op=None, + message=('\n'.join([_pretty_print(d, summarize) for d in data]))) else: # not context.executing_eagerly() if data is None: From c3f713e1cc1d1c14ce9e19a792e4179ca3fc92bf Mon Sep 17 00:00:00 2001 From: frreiss Date: Fri, 19 Oct 2018 14:34:28 -0700 Subject: [PATCH 0008/3053] Address review comment on new PR #23109 --- tensorflow/python/ops/check_ops.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py index 96f1c3b5854..382bf882850 100644 --- a/tensorflow/python/ops/check_ops.py +++ b/tensorflow/python/ops/check_ops.py @@ -236,7 +236,6 @@ def _make_assert_msg_data(sym, x, y, summarize, test_op): data.append('Corresponding y values:') data.append(y_vals.numpy().reshape((-1,))[:num_vals]) - if summarize > 0: # reshape((-1,)) is the fastest way to get a flat array view. x_np = x.numpy().reshape((-1,)) y_np = y.numpy().reshape((-1,)) From 400f08b5657e2f8958f921959ad38d9d03dbec24 Mon Sep 17 00:00:00 2001 From: frreiss Date: Wed, 31 Oct 2018 14:11:43 -0700 Subject: [PATCH 0009/3053] Fixed regression tests under contrib --- .../kernel_tests/bijectors/affine_test.py | 16 +++++++------ .../kernel_tests/bijectors/reshape_test.py | 18 ++++++++++++--- .../kernel_tests/bijectors/softplus_test.py | 8 ++++--- .../python/kernel_tests/cauchy_test.py | 8 ++++--- .../python/kernel_tests/deterministic_test.py | 23 ++++++++++--------- .../python/kernel_tests/half_normal_test.py | 8 ++++--- .../python/kernel_tests/inverse_gamma_test.py | 19 ++++++++------- .../quantized_distribution_test.py | 18 +++++++-------- .../kernel_tests/relaxed_bernoulli_test.py | 11 ++++----- .../metrics/python/ops/metric_ops_test.py | 7 +++--- 10 files changed, 80 insertions(+), 56 deletions(-) diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py index dc18eb3df69..cfb342049f2 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py @@ -24,6 +24,7 @@ import numpy as np from tensorflow.contrib.distributions.python.ops.bijectors.affine import Affine from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.ops import array_ops from tensorflow.python.platform import test @@ -461,13 +462,14 @@ class AffineBijectorTest(test.TestCase): def testNoBatchMultivariateRaisesWhenSingular(self): with self.cached_session(): mu = [1., -1] - bijector = Affine( - shift=mu, - # Has zero on the diagonal. - scale_diag=[0., 1], - validate_args=True) - with self.assertRaisesOpError("diagonal part must be non-zero"): - bijector.forward([1., 1.]).eval() + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "diagonal part must be non-zero"): + bijector = Affine( + shift=mu, + # Has zero on the diagonal. + scale_diag=[0., 1], + validate_args=True) + # Error detected statically; don't need to run the op. 
def _makeScale(self, x, diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py index 79eadf524b5..160d5794efc 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py @@ -22,6 +22,7 @@ import numpy as np from tensorflow.contrib.distributions.python.ops.bijectors.reshape import Reshape from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops from tensorflow.python.ops.distributions.bijector_test_util import assert_bijective_and_finite @@ -150,6 +151,17 @@ class _ReshapeBijectorTest(object): with self.assertRaisesError(expected_error_message): sess.run(bijector.forward_event_shape_tensor(shape_in), feed_dict=feed_dict) + + def _testInvalidDimensionsStatic(self, expected_error_message): + """Version of _testInvalidDimensionsOpError for errors detected statically + at graph construction time.""" + shape_in, shape_out, feed_dict = self.build_shapes([2, 3], [1, 2, -2,]) + with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, + expected_error_message): + bijector = Reshape( + event_shape_out=shape_out, + event_shape_in=shape_in, + validate_args=True) # pylint: enable=invalid-name def testValidButNonMatchingInputOpError(self): @@ -300,9 +312,9 @@ class ReshapeBijectorTestStatic(test.TestCase, _ReshapeBijectorTest): assert_bijective_and_finite( bijector, x, y, event_ndims=2, rtol=1e-6, atol=0) - def testInvalidDimensionsOpError(self): - self._testInvalidDimensionsOpError( - "Invalid value in tensor used for shape: -2") + def testInvalidDimensionsStatic(self): + self._testInvalidDimensionsStatic( + "elements must be either positive integers or `-1`") def testInputOutputMismatchOpError(self): self._testInputOutputMismatchOpError("Cannot reshape a tensor with") diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py index e805619041d..d8484ba22fd 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py @@ -21,6 +21,7 @@ from __future__ import print_function import numpy as np from tensorflow.contrib.distributions.python.ops.bijectors.softplus import Softplus +from tensorflow.python.framework import errors from tensorflow.python.ops.distributions.bijector_test_util import assert_bijective_and_finite from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency from tensorflow.python.platform import test @@ -43,9 +44,10 @@ class SoftplusBijectorTest(test.TestCase): def testHingeSoftnessZeroRaises(self): with self.cached_session(): - bijector = Softplus(hinge_softness=0., validate_args=True) - with self.assertRaisesOpError("must be non-zero"): - bijector.forward([1., 1.]).eval() + with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, + "must be non-zero"): + bijector = Softplus(hinge_softness=0., validate_args=True) + # Error detected statically; don't need to run op. 
def testBijectorForwardInverseEventDimsZero(self): with self.cached_session(): diff --git a/tensorflow/contrib/distributions/python/kernel_tests/cauchy_test.py b/tensorflow/contrib/distributions/python/kernel_tests/cauchy_test.py index 4411d6f4611..353836fb75c 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/cauchy_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/cauchy_test.py @@ -24,6 +24,7 @@ import numpy as np from tensorflow.contrib.distributions.python.ops import cauchy as cauchy_lib from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops @@ -400,9 +401,10 @@ class CauchyTest(test.TestCase): def testCauchyNegativeLocFails(self): with self.cached_session(): - cauchy = cauchy_lib.Cauchy(loc=[1.], scale=[-5.], validate_args=True) - with self.assertRaisesOpError("Condition x > 0 did not hold"): - cauchy.mode().eval() + with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, + "Condition x > 0 did not hold"): + cauchy = cauchy_lib.Cauchy(loc=[1.], scale=[-5.], validate_args=True) + # Error detected statically; no need for cauchy.mode().eval() def testCauchyShape(self): with self.cached_session(): diff --git a/tensorflow/contrib/distributions/python/kernel_tests/deterministic_test.py b/tensorflow/contrib/distributions/python/kernel_tests/deterministic_test.py index 36fc7a70c8a..568ee8f20ff 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/deterministic_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/deterministic_test.py @@ -19,6 +19,7 @@ from __future__ import print_function import numpy as np from tensorflow.contrib.distributions.python.ops import deterministic as deterministic_lib +from tensorflow.python.framework import errors from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops from tensorflow.python.platform import test @@ -40,11 +41,11 @@ class DeterministicTest(test.TestCase): def testInvalidTolRaises(self): loc = rng.rand(2, 3, 4).astype(np.float32) - deterministic = deterministic_lib.Deterministic( - loc, atol=-1, validate_args=True) - with self.cached_session(): - with self.assertRaisesOpError("Condition x >= 0"): - deterministic.prob(0.).eval() + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "Condition x >= 0"): + deterministic = deterministic_lib.Deterministic( + loc, atol=-1, validate_args=True) + # Error detected statically; no need for deterministic.prob(0.).eval() def testProbWithNoBatchDimsIntegerType(self): deterministic = deterministic_lib.Deterministic(0) @@ -195,16 +196,16 @@ class VectorDeterministicTest(test.TestCase): def testInvalidTolRaises(self): loc = rng.rand(2, 3, 4).astype(np.float32) - deterministic = deterministic_lib.VectorDeterministic( - loc, atol=-1, validate_args=True) - with self.cached_session(): - with self.assertRaisesOpError("Condition x >= 0"): - deterministic.prob(loc).eval() + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "Condition x >= 0"): + deterministic = deterministic_lib.VectorDeterministic( + loc, atol=-1, validate_args=True) + # Error detected statically; no need for deterministic.prob(loc).eval() def testInvalidXRaises(self): loc = rng.rand(2, 3, 4).astype(np.float32) deterministic = deterministic_lib.VectorDeterministic( - loc, atol=-1, 
validate_args=True) + loc, atol=None, validate_args=True) with self.cached_session(): with self.assertRaisesRegexp(ValueError, "must have rank at least 1"): deterministic.prob(0.).eval() diff --git a/tensorflow/contrib/distributions/python/kernel_tests/half_normal_test.py b/tensorflow/contrib/distributions/python/kernel_tests/half_normal_test.py index 686de9d2465..a1b8a9e181f 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/half_normal_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/half_normal_test.py @@ -24,6 +24,7 @@ import numpy as np from tensorflow.contrib.distributions.python.ops import half_normal as hn_lib from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops @@ -288,9 +289,10 @@ class HalfNormalTest(test.TestCase): def testNegativeSigmaFails(self): with self.cached_session(): - halfnorm = hn_lib.HalfNormal(scale=[-5.], validate_args=True, name="G") - with self.assertRaisesOpError("Condition x > 0 did not hold"): - halfnorm.mean().eval() + with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, + "Condition x > 0 did not hold"): + halfnorm = hn_lib.HalfNormal(scale=[-5.], validate_args=True, name="G") + # Error detected statically; no need for halfnorm.mean().eval() def testHalfNormalShape(self): with self.cached_session(): diff --git a/tensorflow/contrib/distributions/python/kernel_tests/inverse_gamma_test.py b/tensorflow/contrib/distributions/python/kernel_tests/inverse_gamma_test.py index 70551d89d9c..8ba791cad7d 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/inverse_gamma_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/inverse_gamma_test.py @@ -22,6 +22,7 @@ from scipy import stats from tensorflow.contrib.distributions.python.ops import inverse_gamma from tensorflow.python.client import session from tensorflow.python.framework import constant_op +from tensorflow.python.framework import errors from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import nn_ops from tensorflow.python.platform import test @@ -295,16 +296,18 @@ class InverseGammaTest(test.TestCase): with self.cached_session(): alpha_v = constant_op.constant(0.0, name="alpha") beta_v = constant_op.constant(1.0, name="beta") - inv_gamma = inverse_gamma.InverseGamma( - concentration=alpha_v, rate=beta_v, validate_args=True) - with self.assertRaisesOpError("alpha"): - inv_gamma.mean().eval() + with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, + "alpha"): + inv_gamma = inverse_gamma.InverseGamma( + concentration=alpha_v, rate=beta_v, validate_args=True) + # Error detected statically; no need for inv_gamma.mean().eval() alpha_v = constant_op.constant(1.0, name="alpha") beta_v = constant_op.constant(0.0, name="beta") - inv_gamma = inverse_gamma.InverseGamma( - concentration=alpha_v, rate=beta_v, validate_args=True) - with self.assertRaisesOpError("beta"): - inv_gamma.mean().eval() + with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, + "beta"): + inv_gamma = inverse_gamma.InverseGamma( + concentration=alpha_v, rate=beta_v, validate_args=True) + # Error detected statically; no need for inv_gamma.mean().eval() def testInverseGammaWithSoftplusConcentrationRate(self): with self.cached_session(): diff --git 
a/tensorflow/contrib/distributions/python/kernel_tests/quantized_distribution_test.py b/tensorflow/contrib/distributions/python/kernel_tests/quantized_distribution_test.py index 07528cafaf1..88773fb7aa0 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/quantized_distribution_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/quantized_distribution_test.py @@ -21,6 +21,7 @@ import numpy as np from scipy import stats from tensorflow.contrib import distributions as distributions_lib from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import gradients_impl @@ -361,15 +362,14 @@ class QuantizedDistributionTest(test.TestCase): def testLowerCutoffMustBeBelowUpperCutoffOrWeRaise(self): with self.cached_session(): - qdist = distributions.QuantizedDistribution( - distribution=distributions.Normal(loc=0., scale=1.), - low=1., # not strictly less than high. - high=1., - validate_args=True) - - self.assertTrue(qdist.validate_args) # Default is True. - with self.assertRaisesOpError("must be strictly less"): - qdist.sample().eval() + with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, + "must be strictly less"): + qdist = distributions.QuantizedDistribution( + distribution=distributions.Normal(loc=0., scale=1.), + low=1., # not strictly less than high. + high=1., + validate_args=True) + # Error detected statically; no need for qdist.sample().eval() def testCutoffsMustBeIntegerValuedIfValidateArgsTrue(self): with self.cached_session(): diff --git a/tensorflow/contrib/distributions/python/kernel_tests/relaxed_bernoulli_test.py b/tensorflow/contrib/distributions/python/kernel_tests/relaxed_bernoulli_test.py index fec23749286..85ee0095716 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/relaxed_bernoulli_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/relaxed_bernoulli_test.py @@ -94,12 +94,11 @@ class RelaxedBernoulliTest(test.TestCase): """If validate_args, raises InvalidArgumentError when temperature is 0.""" temperature = constant_op.constant(0.0) p = constant_op.constant([0.1, 0.4]) - dist = relaxed_bernoulli.RelaxedBernoulli(temperature, probs=p, - validate_args=True) - with self.cached_session(): - sample = dist.sample() - with self.assertRaises(errors_impl.InvalidArgumentError): - sample.eval() + with self.assertRaisesWithPredicateMatch(errors_impl.InvalidArgumentError, + "x > 0 did not hold"): + dist = relaxed_bernoulli.RelaxedBernoulli(temperature, probs=p, + validate_args=True) + # Error detected statically; no need to run the op. 
def testDtype(self): temperature = constant_op.constant(1.0, dtype=dtypes.float32) diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py index fc64f343ab4..6c824c05419 100644 --- a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py +++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py @@ -1734,9 +1734,10 @@ class StreamingAUCTest(test.TestCase): predictions = constant_op.constant( [1, -1, 1, -1], shape=(1, 4), dtype=dtypes_lib.float32) labels = constant_op.constant([0, 1, 1, 0], shape=(1, 4)) - _, update_op = metrics.streaming_auc(predictions, labels) - sess.run(variables.local_variables_initializer()) - self.assertRaises(errors_impl.InvalidArgumentError, update_op.eval) + with self.assertRaisesRegexp(errors_impl.InvalidArgumentError, + r"predictions must be in \[0, 1\]"): + _, update_op = metrics.streaming_auc(predictions, labels) + # Error detected statically; no need to run the op. def testAllCorrect(self): self.allCorrectAsExpected('ROC') From 48974e999f17b67b89123a883a93dc8129b53686 Mon Sep 17 00:00:00 2001 From: frreiss Date: Mon, 19 Nov 2018 20:55:06 -0800 Subject: [PATCH 0010/3053] Make regexes less strict about whitespace --- tensorflow/python/kernel_tests/check_ops_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/kernel_tests/check_ops_test.py b/tensorflow/python/kernel_tests/check_ops_test.py index 197189f9906..c14daa9bfdf 100644 --- a/tensorflow/python/kernel_tests/check_ops_test.py +++ b/tensorflow/python/kernel_tests/check_ops_test.py @@ -307,9 +307,9 @@ class AssertNoneEqualTest(test.TestCase): def test_error_message_eager(self): # Note that the following three strings are regexes - expected_error_msg_full = r"""\[0\. 1\. 2\. 3\. 4\. 5\.\]""" - expected_error_msg_default = r"""\[0\. 1\. 2\.\]""" - expected_error_msg_short = r"""\[0\. 1\.\]""" + expected_error_msg_full = r"""\[ *0\. +1\. +2\. +3\. +4\. +5\.\]""" + expected_error_msg_default = r"""\[ *0\. +1\. +2\.\]""" + expected_error_msg_short = r"""\[ *0\. +1\.\]""" with context.eager_mode(): t = constant_op.constant( np.array(range(6)), shape=[2, 3], dtype=np.float32) From 90f8ea920b082fc41d09026f6c788920d010d63f Mon Sep 17 00:00:00 2001 From: Mark Ryan Date: Wed, 20 Feb 2019 14:26:16 +0100 Subject: [PATCH 0011/3053] Fix eigen_spatial_convolutions_test benchmarks This commit fixes a crash in PackRhsHelper caused by a memory corruption error. The function contains a loop that populates two vectors, one containing input Tensors and the other containing InputMappers that point to those input Tensors. The problem is that the emplace_back call on the vector of input Tensors can cause that vector to grow which can invalidate the pointers to the previously allocated input Tensors. Unfortunately, these invalidated pointers are still used by the InputMappers in the second vector and so when we use the InputMappers we get a crash. The commit fixes the issue by reserving sufficient space in the input vector thereby preventing reallocations and invalidation of the pointers to the Input Tensors. Although the PackLhsHelper function does not crash on my machine it suffers from the same error and so this commit also contains a fix for that function. 
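For illustration, a minimal self-contained sketch of the failure mode and of the reserve() fix described above; the Tensor and InputMapper types here are simplified stand-ins rather than the benchmark's real types:

    #include <vector>

    struct Tensor { float data[16]; };

    struct InputMapper {
      const Tensor* src;  // points into `inputs`; dangles if that vector reallocates
    };

    int main() {
      const int num_inputs = 8;
      std::vector<Tensor> inputs;
      std::vector<InputMapper> input_mappers;
      // Reserving up front guarantees no reallocation inside the loop, so the
      // Tensor* captured by each InputMapper stays valid.
      inputs.reserve(num_inputs);
      input_mappers.reserve(num_inputs);
      for (int i = 0; i < num_inputs; ++i) {
        inputs.emplace_back();
        input_mappers.push_back(InputMapper{&inputs[i]});
      }
      return 0;
    }

Without the reserve() calls, any emplace_back that grows the vector past its current capacity moves the existing Tensors to new storage, leaving every previously captured pointer dangling.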
Fixes: https://github.com/tensorflow/tensorflow/issues/26251 --- .../core/kernels/eigen_spatial_convolutions_test.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc b/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc index 9aba7b63278..5fd895a09a3 100644 --- a/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc +++ b/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc @@ -1468,6 +1468,10 @@ static void PackRhsHelper(int iters, std::vector evaluators; std::vector input_mappers; + inputs.reserve(num_inputs); + evaluators.reserve(num_inputs); + input_mappers.reserve(num_inputs); + for (int i = 0; i < num_inputs; ++i) { inputs.emplace_back(input_dims); inputs[i].setRandom(); @@ -1652,6 +1656,10 @@ static void PackLhsHelper(int iters, std::vector evaluators; std::vector input_mappers; + filters.reserve(num_filters); + evaluators.reserve(num_filters); + input_mappers.reserve(num_filters); + for (int i = 0; i < num_filters; ++i) { filters.emplace_back(filter_dims); filters[i].setRandom(); From b4a142283d670c89b8971d8fd7181a6f462fdd4e Mon Sep 17 00:00:00 2001 From: XinPing Wang Date: Tue, 5 Mar 2019 07:48:50 +0800 Subject: [PATCH 0012/3053] Disable NNAPI api for Raspberry Pi --- tensorflow/lite/tools/make/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/lite/tools/make/Makefile b/tensorflow/lite/tools/make/Makefile index 8428e0d2e6b..2c7bc5757df 100644 --- a/tensorflow/lite/tools/make/Makefile +++ b/tensorflow/lite/tools/make/Makefile @@ -131,6 +131,9 @@ endif ifeq ($(TARGET),ios) BUILD_WITH_NNAPI=false endif +ifeq ($(TARGET),rpi) + BUILD_WITH_NNAPI=false +endif ifeq ($(BUILD_WITH_NNAPI),true) CORE_CC_EXCLUDE_SRCS += tensorflow/lite/nnapi_delegate_disabled.cc else From 5702bb9fd7b6696cf55e7301f9ef9d6b6926c998 Mon Sep 17 00:00:00 2001 From: XinPing Wang Date: Wed, 6 Mar 2019 12:59:26 +0800 Subject: [PATCH 0013/3053] New build target for aarch64 without NNAPI --- tensorflow/lite/tools/make/Makefile | 3 ++ .../tools/make/build_generic_aarch64_lib.sh | 22 +++++++++++++ .../make/targets/generic_aarch64_makefile.inc | 33 +++++++++++++++++++ 3 files changed, 58 insertions(+) create mode 100755 tensorflow/lite/tools/make/build_generic_aarch64_lib.sh create mode 100644 tensorflow/lite/tools/make/targets/generic_aarch64_makefile.inc diff --git a/tensorflow/lite/tools/make/Makefile b/tensorflow/lite/tools/make/Makefile index 2c7bc5757df..78208a76103 100644 --- a/tensorflow/lite/tools/make/Makefile +++ b/tensorflow/lite/tools/make/Makefile @@ -134,6 +134,9 @@ endif ifeq ($(TARGET),rpi) BUILD_WITH_NNAPI=false endif +ifeq ($(TARGET),generic-aarch64) + BUILD_WITH_NNAPI=false +endif ifeq ($(BUILD_WITH_NNAPI),true) CORE_CC_EXCLUDE_SRCS += tensorflow/lite/nnapi_delegate_disabled.cc else diff --git a/tensorflow/lite/tools/make/build_generic_aarch64_lib.sh b/tensorflow/lite/tools/make/build_generic_aarch64_lib.sh new file mode 100755 index 00000000000..d497b94ffc0 --- /dev/null +++ b/tensorflow/lite/tools/make/build_generic_aarch64_lib.sh @@ -0,0 +1,22 @@ +#!/bin/bash -x +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR/../../../.." + +CC_PREFIX=aarch64-linux-gnu- make -j 3 -f tensorflow/lite/tools/make/Makefile TARGET=generic-aarch64 TARGET_ARCH=armv8-a diff --git a/tensorflow/lite/tools/make/targets/generic_aarch64_makefile.inc b/tensorflow/lite/tools/make/targets/generic_aarch64_makefile.inc new file mode 100644 index 00000000000..f4e4f1f9c4d --- /dev/null +++ b/tensorflow/lite/tools/make/targets/generic_aarch64_makefile.inc @@ -0,0 +1,33 @@ +# Settings for generic aarch64 boards such as Odroid C2 or Pine64. +ifeq ($(TARGET),generic-aarch64) + # The aarch64 architecture covers all 64-bit ARM chips. This arch mandates + # NEON, so FPU flags are not needed below. + TARGET_ARCH := armv8-a + TARGET_TOOLCHAIN_PREFIX := aarch64-linux-gnu- + + CXXFLAGS += \ + -march=armv8-a \ + -funsafe-math-optimizations \ + -ftree-vectorize \ + -fPIC + + CCFLAGS += \ + -march=armv8-a \ + -funsafe-math-optimizations \ + -ftree-vectorize \ + -fPIC + + LDFLAGS := \ + -Wl,--no-export-dynamic \ + -Wl,--exclude-libs,ALL \ + -Wl,--gc-sections \ + -Wl,--as-needed + + + LIBS := \ + -lstdc++ \ + -lpthread \ + -lm \ + -ldl + +endif From 89ea6622a749950149085dbe65077d3a1ec8c1ce Mon Sep 17 00:00:00 2001 From: Pariksheet Pinjari Date: Tue, 19 Mar 2019 17:05:03 +0530 Subject: [PATCH 0014/3053] Compilation warnings removed from tensor_format.h Removed compilation warnings --- tensorflow/core/util/tensor_format.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tensorflow/core/util/tensor_format.h b/tensorflow/core/util/tensor_format.h index 643e14e0b56..8013746d017 100644 --- a/tensorflow/core/util/tensor_format.h +++ b/tensorflow/core/util/tensor_format.h @@ -120,6 +120,9 @@ inline int GetTensorSpatialDims(int num_dims, TensorFormat format) { // Note: the VECT_W is not counted as an independent spatial dim here, // since it just a component of the width dimension. return num_dims - 3; // Exclude N,C,VectDim. + default: + LOG(FATAL) << "Unknown format " << format; + return -1; // Avoid compiler warning about missing return value } } @@ -144,6 +147,9 @@ inline int GetTensorDimsFromSpatialDims(int num_spatial_dims, case FORMAT_NCHW_VECT_C: case FORMAT_NHWC_VECT_W: return num_spatial_dims + 3; // Include N,C,VectDim. 
+ default: + LOG(FATAL) << "Unknown format " << format; + return -1; // Avoid compiler warning about missing return value } } From 82bbe77119195aa9ca5736b78bd31204e9448261 Mon Sep 17 00:00:00 2001 From: ANSHUMAN TRIPATHY Date: Mon, 1 Apr 2019 15:50:12 +0530 Subject: [PATCH 0015/3053] Compilation warnings handled --- tensorflow/lite/kernels/add_test.cc | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tensorflow/lite/kernels/add_test.cc b/tensorflow/lite/kernels/add_test.cc index 2904f4a11a9..87a916a1ab6 100644 --- a/tensorflow/lite/kernels/add_test.cc +++ b/tensorflow/lite/kernels/add_test.cc @@ -109,7 +109,7 @@ TEST(FloatAddOpModel, ActivationRELU_N1_TO_1) { TEST(FloatAddOpModel, VariousInputShapes) { std::vector> test_shapes = { {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; - for (int i = 0; i < test_shapes.size(); ++i) { + for (uint i = 0; i < test_shapes.size(); ++i) { FloatAddOpModel m({TensorType_FLOAT32, test_shapes[i]}, {TensorType_FLOAT32, test_shapes[i]}, {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE); @@ -125,7 +125,7 @@ TEST(FloatAddOpModel, VariousInputShapes) { TEST(FloatAddOpModel, WithBroadcast) { std::vector> test_shapes = { {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; - for (int i = 0; i < test_shapes.size(); ++i) { + for (uint i = 0; i < test_shapes.size(); ++i) { FloatAddOpModel m({TensorType_FLOAT32, test_shapes[i]}, {TensorType_FLOAT32, {}}, // always a scalar {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE); @@ -162,7 +162,7 @@ TEST(IntegerAddOpModel, ActivationRELU_N1_TO_1) { TEST(IntegerAddOpModel, VariousInputShapes) { std::vector> test_shapes = { {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; - for (int i = 0; i < test_shapes.size(); ++i) { + for (uint i = 0; i < test_shapes.size(); ++i) { IntegerAddOpModel m({TensorType_INT32, test_shapes[i]}, {TensorType_INT32, test_shapes[i]}, {TensorType_INT32, {}}, ActivationFunctionType_NONE); @@ -177,7 +177,7 @@ TEST(IntegerAddOpModel, VariousInputShapes) { TEST(IntegerAddOpModel, WithBroadcast) { std::vector> test_shapes = { {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; - for (int i = 0; i < test_shapes.size(); ++i) { + for (uint i = 0; i < test_shapes.size(); ++i) { IntegerAddOpModel m({TensorType_INT32, test_shapes[i]}, {TensorType_INT32, {}}, // always a scalar {TensorType_INT32, {}}, ActivationFunctionType_NONE); @@ -199,7 +199,7 @@ void QuantizedTestsNoActivation() { {0.6, 0.4, 0.3, 0.1}, {0.6, 0.4, 0.5, -0.8}, {0.6, 0.4, -0.8, 0.5}}; std::vector> results = { {0.7, 0.6, 0.6, 0.5}, {-0.2, 0.6, 0.9, -0.1}, {-0.2, 0.6, -0.1, 0.8}}; - for (int i = 0; i < inputs1.size(); ++i) { + for (uint i = 0; i < inputs1.size(); ++i) { QuantizedAddOpModel m({tensor_type, {1, 2, 2, 1}, -1.0, 1.0}, {tensor_type, {1, 2, 2, 1}, -1.0, 1.0}, {tensor_type, {}, -1.0, 1.0}, @@ -232,7 +232,7 @@ TEST(QuantizedAddOpModel, QuantizedTestsNoActivationInt16) { {0.6, 0.4, 0.3, 0.1}, {0.6, 0.4, 0.5, -0.8}, {0.6, 0.4, -0.8, 0.5}}; std::vector> results = { {0.7, 0.6, 0.6, 0.5}, {-0.2, 0.6, 0.9, -0.1}, {-0.2, 0.6, -0.1, 0.8}}; - for (int i = 0; i < inputs1.size(); ++i) { + for (uint i = 0; i < inputs1.size(); ++i) { QuantizedAddOpModel m({TensorType_INT16, {1, 2, 2, 1}, kMin, kMax}, {TensorType_INT16, {1, 2, 2, 1}, kMin, kMax}, {TensorType_INT16, {}, kMin, kMax}, @@ -256,7 +256,7 @@ void QuantizedTestsActivationRELU_N1_TO_1() { {0.6, 0.4, -0.8, 0.5}}; std::vector> results = {{-0.2, 0.6, 1.0, -0.1}, {-0.2, 0.6, -0.1, 0.8}}; - for (int i = 0; i < inputs1.size(); ++i) { + for (uint i = 0; i < inputs1.size(); ++i) { 
QuantizedAddOpModel m({tensor_type, {1, 2, 2, 1}, -1.0, 1.0}, {tensor_type, {1, 2, 2, 1}, -1.0, 1.0}, {tensor_type, {}, -1.0, 1.0}, @@ -284,7 +284,7 @@ void QuantizedVariousInputShapes() { float kQuantizedTolerance = GetTolerance(-3.0, 3.0); std::vector> test_shapes = { {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; - for (int i = 0; i < test_shapes.size(); ++i) { + for (uint i = 0; i < test_shapes.size(); ++i) { QuantizedAddOpModel m({tensor_type, test_shapes[i], -3.0, 3.0}, {tensor_type, test_shapes[i], -3.0, 3.0}, {tensor_type, {}, -3.0, 3.0}, @@ -314,7 +314,7 @@ void QuantizedWithScalarBroadcast() { float kQuantizedTolerance = GetTolerance(-3.f, 3.f); std::vector> test_shapes = { {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; - for (int i = 0; i < test_shapes.size(); ++i) { + for (uint i = 0; i < test_shapes.size(); ++i) { QuantizedAddOpModel model_fixture( {tensor_type, test_shapes[i], -3.f, 3.f}, {tensor_type, {}, -3.f, 3.f}, {tensor_type, {}, -3.f, 3.f}, ActivationFunctionType_NONE); @@ -330,7 +330,7 @@ void QuantizedWithScalarBroadcast() { << "With shape number " << i; } // Re-run with exchanged inputs. - for (int i = 0; i < test_shapes.size(); ++i) { + for (uint i = 0; i < test_shapes.size(); ++i) { QuantizedAddOpModel model_fixture( {tensor_type, {}, -3.f, 3.f}, {tensor_type, test_shapes[i], -3.f, 3.f}, {tensor_type, {}, -3.f, 3.f}, ActivationFunctionType_NONE); @@ -374,7 +374,7 @@ void QuantizedWithMixedBroadcast() { 1.0f, -0.7f, 0.9f, 1.2f, -1.7f, 1.7f, -1.2f, 1.6f, -1.3f}, {-0.1f, 2.5f, 1.2f, 0.8f, 0.4f, -1.5f, 1.7f, 3.0f, -0.6f, 1.0f, 1.6f, -1.3f}}; - for (int i = 0; i < test_shapes.size(); ++i) { + for (uint i = 0; i < test_shapes.size(); ++i) { QuantizedAddOpModel model_fixture({tensor_type, base_shape, -3.f, 3.f}, {tensor_type, test_shapes[i], -3.f, 3.f}, {tensor_type, {}, -3.f, 3.f}, @@ -391,7 +391,7 @@ void QuantizedWithMixedBroadcast() { << "With shape number " << i; } // Re-run with exchanged inputs. 
- for (int i = 0; i < test_shapes.size(); ++i) { + for (uint i = 0; i < test_shapes.size(); ++i) { QuantizedAddOpModel model_fixture({tensor_type, test_shapes[i], -3.f, 3.f}, {tensor_type, base_shape, -3.f, 3.f}, {tensor_type, {}, -3.f, 3.f}, From 754cda01bb93b1f0ca01b53cefb81511c811767f Mon Sep 17 00:00:00 2001 From: frreiss Date: Mon, 1 Apr 2019 12:59:56 -0700 Subject: [PATCH 0016/3053] context.graph_mode() ==> ops.Graph().as_default() --- .../python/kernel_tests/check_ops_test.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tensorflow/python/kernel_tests/check_ops_test.py b/tensorflow/python/kernel_tests/check_ops_test.py index 053fa222884..d01655f45e5 100644 --- a/tensorflow/python/kernel_tests/check_ops_test.py +++ b/tensorflow/python/kernel_tests/check_ops_test.py @@ -369,7 +369,7 @@ class AssertNoneEqualTest(test.TestCase): assert x is None def test_static_check_in_graph_mode(self): - with context.graph_mode(): + with ops.Graph().as_default(): with self.assertRaisesRegexp(errors.InvalidArgumentError, "Custom error message"): check_ops.assert_none_equal(1, 1, message="Custom error message") @@ -581,7 +581,7 @@ class AssertLessTest(test.TestCase): assert x is None def test_static_check_in_graph_mode(self): - with context.graph_mode(): + with ops.Graph().as_default(): with self.assertRaisesRegexp(errors.InvalidArgumentError, "Custom error message"): check_ops.assert_less(1, 1, message="Custom error message") @@ -651,7 +651,7 @@ class AssertLessEqualTest(test.TestCase): self.evaluate(out) def test_static_check_in_graph_mode(self): - with context.graph_mode(): + with ops.Graph().as_default(): with self.assertRaisesRegexp(errors.InvalidArgumentError, "Custom error message"): check_ops.assert_less_equal(1, 0, message="Custom error message") @@ -720,7 +720,7 @@ class AssertGreaterTest(test.TestCase): self.evaluate(out) def test_static_check_in_graph_mode(self): - with context.graph_mode(): + with ops.Graph().as_default(): with self.assertRaisesRegexp(errors.InvalidArgumentError, "Custom error message"): check_ops.assert_greater(0, 1, message="Custom error message") @@ -792,7 +792,7 @@ class AssertGreaterEqualTest(test.TestCase): self.evaluate(out) def test_static_check_in_graph_mode(self): - with context.graph_mode(): + with ops.Graph().as_default(): with self.assertRaisesRegexp(errors.InvalidArgumentError, "Custom error message"): check_ops.assert_greater_equal(0, 1, message="Custom error message") @@ -839,7 +839,7 @@ class AssertNegativeTest(test.TestCase): self.evaluate(out) def test_static_check_in_graph_mode(self): - with context.graph_mode(): + with ops.Graph().as_default(): with self.assertRaisesRegexp(errors.InvalidArgumentError, "Custom error message"): check_ops.assert_negative(1, message="Custom error message") @@ -886,7 +886,7 @@ class AssertPositiveTest(test.TestCase): self.evaluate(out) def test_static_check_in_graph_mode(self): - with context.graph_mode(): + with ops.Graph().as_default(): with self.assertRaisesRegexp(errors.InvalidArgumentError, "Custom error message"): check_ops.assert_positive(-1, message="Custom error message") @@ -1433,7 +1433,7 @@ class AssertNonNegativeTest(test.TestCase): self.evaluate(out) def test_static_check_in_graph_mode(self): - with context.graph_mode(): + with ops.Graph().as_default(): with self.assertRaisesRegexp(errors.InvalidArgumentError, "Custom error message"): check_ops.assert_non_negative(-1, message="Custom error message") @@ -1470,7 +1470,7 @@ class AssertNonPositiveTest(test.TestCase): 
self.evaluate(out) def test_static_check_in_graph_mode(self): - with context.graph_mode(): + with ops.Graph().as_default(): with self.assertRaisesRegexp(errors.InvalidArgumentError, "Custom error message"): check_ops.assert_non_positive(1, message="Custom error message") From f757bf9048574d2a0d4a5cba0b063653bedb19ec Mon Sep 17 00:00:00 2001 From: ANSHUMAN TRIPATHY Date: Tue, 2 Apr 2019 12:03:20 +0530 Subject: [PATCH 0017/3053] Updated to keep the index as int --- tensorflow/lite/kernels/add_test.cc | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tensorflow/lite/kernels/add_test.cc b/tensorflow/lite/kernels/add_test.cc index 87a916a1ab6..70f93b9780b 100644 --- a/tensorflow/lite/kernels/add_test.cc +++ b/tensorflow/lite/kernels/add_test.cc @@ -109,7 +109,7 @@ TEST(FloatAddOpModel, ActivationRELU_N1_TO_1) { TEST(FloatAddOpModel, VariousInputShapes) { std::vector> test_shapes = { {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; - for (uint i = 0; i < test_shapes.size(); ++i) { + for (int i = 0; i < static_cast(test_shapes.size()); ++i) { FloatAddOpModel m({TensorType_FLOAT32, test_shapes[i]}, {TensorType_FLOAT32, test_shapes[i]}, {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE); @@ -125,7 +125,7 @@ TEST(FloatAddOpModel, VariousInputShapes) { TEST(FloatAddOpModel, WithBroadcast) { std::vector> test_shapes = { {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; - for (uint i = 0; i < test_shapes.size(); ++i) { + for (int i = 0; i < static_cast(test_shapes.size()); ++i) { FloatAddOpModel m({TensorType_FLOAT32, test_shapes[i]}, {TensorType_FLOAT32, {}}, // always a scalar {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE); @@ -162,7 +162,7 @@ TEST(IntegerAddOpModel, ActivationRELU_N1_TO_1) { TEST(IntegerAddOpModel, VariousInputShapes) { std::vector> test_shapes = { {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; - for (uint i = 0; i < test_shapes.size(); ++i) { + for (int i = 0; i < static_cast(test_shapes.size()); ++i) { IntegerAddOpModel m({TensorType_INT32, test_shapes[i]}, {TensorType_INT32, test_shapes[i]}, {TensorType_INT32, {}}, ActivationFunctionType_NONE); @@ -177,7 +177,7 @@ TEST(IntegerAddOpModel, VariousInputShapes) { TEST(IntegerAddOpModel, WithBroadcast) { std::vector> test_shapes = { {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; - for (uint i = 0; i < test_shapes.size(); ++i) { + for (int i = 0; i < static_cast(test_shapes.size()); ++i) { IntegerAddOpModel m({TensorType_INT32, test_shapes[i]}, {TensorType_INT32, {}}, // always a scalar {TensorType_INT32, {}}, ActivationFunctionType_NONE); @@ -199,7 +199,7 @@ void QuantizedTestsNoActivation() { {0.6, 0.4, 0.3, 0.1}, {0.6, 0.4, 0.5, -0.8}, {0.6, 0.4, -0.8, 0.5}}; std::vector> results = { {0.7, 0.6, 0.6, 0.5}, {-0.2, 0.6, 0.9, -0.1}, {-0.2, 0.6, -0.1, 0.8}}; - for (uint i = 0; i < inputs1.size(); ++i) { + for (int i = 0; i < static_cast(inputs1.size()); ++i) { QuantizedAddOpModel m({tensor_type, {1, 2, 2, 1}, -1.0, 1.0}, {tensor_type, {1, 2, 2, 1}, -1.0, 1.0}, {tensor_type, {}, -1.0, 1.0}, @@ -232,7 +232,7 @@ TEST(QuantizedAddOpModel, QuantizedTestsNoActivationInt16) { {0.6, 0.4, 0.3, 0.1}, {0.6, 0.4, 0.5, -0.8}, {0.6, 0.4, -0.8, 0.5}}; std::vector> results = { {0.7, 0.6, 0.6, 0.5}, {-0.2, 0.6, 0.9, -0.1}, {-0.2, 0.6, -0.1, 0.8}}; - for (uint i = 0; i < inputs1.size(); ++i) { + for (int i = 0; i < static_cast(inputs1.size()); ++i) { QuantizedAddOpModel m({TensorType_INT16, {1, 2, 2, 1}, kMin, kMax}, {TensorType_INT16, {1, 2, 2, 1}, kMin, kMax}, {TensorType_INT16, {}, kMin, kMax}, @@ -256,7 +256,7 @@ void 
QuantizedTestsActivationRELU_N1_TO_1() { {0.6, 0.4, -0.8, 0.5}}; std::vector> results = {{-0.2, 0.6, 1.0, -0.1}, {-0.2, 0.6, -0.1, 0.8}}; - for (uint i = 0; i < inputs1.size(); ++i) { + for (int i = 0; i < static_cast(inputs1.size()); ++i) { QuantizedAddOpModel m({tensor_type, {1, 2, 2, 1}, -1.0, 1.0}, {tensor_type, {1, 2, 2, 1}, -1.0, 1.0}, {tensor_type, {}, -1.0, 1.0}, @@ -284,7 +284,7 @@ void QuantizedVariousInputShapes() { float kQuantizedTolerance = GetTolerance(-3.0, 3.0); std::vector> test_shapes = { {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; - for (uint i = 0; i < test_shapes.size(); ++i) { + for (int i = 0; i < static_cast(test_shapes.size()); ++i) { QuantizedAddOpModel m({tensor_type, test_shapes[i], -3.0, 3.0}, {tensor_type, test_shapes[i], -3.0, 3.0}, {tensor_type, {}, -3.0, 3.0}, @@ -314,7 +314,7 @@ void QuantizedWithScalarBroadcast() { float kQuantizedTolerance = GetTolerance(-3.f, 3.f); std::vector> test_shapes = { {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; - for (uint i = 0; i < test_shapes.size(); ++i) { + for (int i = 0; i < static_cast(test_shapes.size()); ++i) { QuantizedAddOpModel model_fixture( {tensor_type, test_shapes[i], -3.f, 3.f}, {tensor_type, {}, -3.f, 3.f}, {tensor_type, {}, -3.f, 3.f}, ActivationFunctionType_NONE); @@ -330,7 +330,7 @@ void QuantizedWithScalarBroadcast() { << "With shape number " << i; } // Re-run with exchanged inputs. - for (uint i = 0; i < test_shapes.size(); ++i) { + for (int i = 0; i < static_cast(test_shapes.size()); ++i) { QuantizedAddOpModel model_fixture( {tensor_type, {}, -3.f, 3.f}, {tensor_type, test_shapes[i], -3.f, 3.f}, {tensor_type, {}, -3.f, 3.f}, ActivationFunctionType_NONE); @@ -374,7 +374,7 @@ void QuantizedWithMixedBroadcast() { 1.0f, -0.7f, 0.9f, 1.2f, -1.7f, 1.7f, -1.2f, 1.6f, -1.3f}, {-0.1f, 2.5f, 1.2f, 0.8f, 0.4f, -1.5f, 1.7f, 3.0f, -0.6f, 1.0f, 1.6f, -1.3f}}; - for (uint i = 0; i < test_shapes.size(); ++i) { + for (int i = 0; i < static_cast(test_shapes.size()); ++i) { QuantizedAddOpModel model_fixture({tensor_type, base_shape, -3.f, 3.f}, {tensor_type, test_shapes[i], -3.f, 3.f}, {tensor_type, {}, -3.f, 3.f}, @@ -391,7 +391,7 @@ void QuantizedWithMixedBroadcast() { << "With shape number " << i; } // Re-run with exchanged inputs. 
- for (uint i = 0; i < test_shapes.size(); ++i) { + for (int i = 0; i < static_cast(test_shapes.size()); ++i) { QuantizedAddOpModel model_fixture({tensor_type, test_shapes[i], -3.f, 3.f}, {tensor_type, base_shape, -3.f, 3.f}, {tensor_type, {}, -3.f, 3.f}, From e435613e3d0a89ab822d2ffba3578679333c3523 Mon Sep 17 00:00:00 2001 From: ANSHUMAN TRIPATHY Date: Wed, 3 Apr 2019 08:41:49 +0530 Subject: [PATCH 0018/3053] [1] Review comments handled --- tensorflow/lite/kernels/add_test.cc | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tensorflow/lite/kernels/add_test.cc b/tensorflow/lite/kernels/add_test.cc index 70f93b9780b..42a6c5dfcdb 100644 --- a/tensorflow/lite/kernels/add_test.cc +++ b/tensorflow/lite/kernels/add_test.cc @@ -109,7 +109,7 @@ TEST(FloatAddOpModel, ActivationRELU_N1_TO_1) { TEST(FloatAddOpModel, VariousInputShapes) { std::vector> test_shapes = { {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; - for (int i = 0; i < static_cast(test_shapes.size()); ++i) { + for (size_t i = 0; i < test_shapes.size(); ++i) { FloatAddOpModel m({TensorType_FLOAT32, test_shapes[i]}, {TensorType_FLOAT32, test_shapes[i]}, {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE); @@ -125,7 +125,7 @@ TEST(FloatAddOpModel, VariousInputShapes) { TEST(FloatAddOpModel, WithBroadcast) { std::vector> test_shapes = { {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; - for (int i = 0; i < static_cast(test_shapes.size()); ++i) { + for (size_t i = 0; i < test_shapes.size(); ++i) { FloatAddOpModel m({TensorType_FLOAT32, test_shapes[i]}, {TensorType_FLOAT32, {}}, // always a scalar {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE); @@ -162,7 +162,7 @@ TEST(IntegerAddOpModel, ActivationRELU_N1_TO_1) { TEST(IntegerAddOpModel, VariousInputShapes) { std::vector> test_shapes = { {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; - for (int i = 0; i < static_cast(test_shapes.size()); ++i) { + for (size_t i = 0; i < test_shapes.size(); ++i) { IntegerAddOpModel m({TensorType_INT32, test_shapes[i]}, {TensorType_INT32, test_shapes[i]}, {TensorType_INT32, {}}, ActivationFunctionType_NONE); @@ -177,7 +177,7 @@ TEST(IntegerAddOpModel, VariousInputShapes) { TEST(IntegerAddOpModel, WithBroadcast) { std::vector> test_shapes = { {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; - for (int i = 0; i < static_cast(test_shapes.size()); ++i) { + for (size_t i = 0; i < test_shapes.size(); ++i) { IntegerAddOpModel m({TensorType_INT32, test_shapes[i]}, {TensorType_INT32, {}}, // always a scalar {TensorType_INT32, {}}, ActivationFunctionType_NONE); @@ -199,7 +199,7 @@ void QuantizedTestsNoActivation() { {0.6, 0.4, 0.3, 0.1}, {0.6, 0.4, 0.5, -0.8}, {0.6, 0.4, -0.8, 0.5}}; std::vector> results = { {0.7, 0.6, 0.6, 0.5}, {-0.2, 0.6, 0.9, -0.1}, {-0.2, 0.6, -0.1, 0.8}}; - for (int i = 0; i < static_cast(inputs1.size()); ++i) { + for (size_t i = 0; i < inputs1.size(); ++i) { QuantizedAddOpModel m({tensor_type, {1, 2, 2, 1}, -1.0, 1.0}, {tensor_type, {1, 2, 2, 1}, -1.0, 1.0}, {tensor_type, {}, -1.0, 1.0}, @@ -232,7 +232,7 @@ TEST(QuantizedAddOpModel, QuantizedTestsNoActivationInt16) { {0.6, 0.4, 0.3, 0.1}, {0.6, 0.4, 0.5, -0.8}, {0.6, 0.4, -0.8, 0.5}}; std::vector> results = { {0.7, 0.6, 0.6, 0.5}, {-0.2, 0.6, 0.9, -0.1}, {-0.2, 0.6, -0.1, 0.8}}; - for (int i = 0; i < static_cast(inputs1.size()); ++i) { + for (size_t i = 0; i < inputs1.size(); ++i) { QuantizedAddOpModel m({TensorType_INT16, {1, 2, 2, 1}, kMin, kMax}, {TensorType_INT16, {1, 2, 2, 1}, kMin, kMax}, {TensorType_INT16, {}, kMin, kMax}, @@ -256,7 +256,7 @@ void 
QuantizedTestsActivationRELU_N1_TO_1() { {0.6, 0.4, -0.8, 0.5}}; std::vector> results = {{-0.2, 0.6, 1.0, -0.1}, {-0.2, 0.6, -0.1, 0.8}}; - for (int i = 0; i < static_cast(inputs1.size()); ++i) { + for (size_t i = 0; i < inputs1.size(); ++i) { QuantizedAddOpModel m({tensor_type, {1, 2, 2, 1}, -1.0, 1.0}, {tensor_type, {1, 2, 2, 1}, -1.0, 1.0}, {tensor_type, {}, -1.0, 1.0}, @@ -284,7 +284,7 @@ void QuantizedVariousInputShapes() { float kQuantizedTolerance = GetTolerance(-3.0, 3.0); std::vector> test_shapes = { {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; - for (int i = 0; i < static_cast(test_shapes.size()); ++i) { + for (size_t i = 0; i < test_shapes.size(); ++i) { QuantizedAddOpModel m({tensor_type, test_shapes[i], -3.0, 3.0}, {tensor_type, test_shapes[i], -3.0, 3.0}, {tensor_type, {}, -3.0, 3.0}, @@ -314,7 +314,7 @@ void QuantizedWithScalarBroadcast() { float kQuantizedTolerance = GetTolerance(-3.f, 3.f); std::vector> test_shapes = { {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}}; - for (int i = 0; i < static_cast(test_shapes.size()); ++i) { + for (size_t i = 0; i < test_shapes.size(); ++i) { QuantizedAddOpModel model_fixture( {tensor_type, test_shapes[i], -3.f, 3.f}, {tensor_type, {}, -3.f, 3.f}, {tensor_type, {}, -3.f, 3.f}, ActivationFunctionType_NONE); @@ -330,7 +330,7 @@ void QuantizedWithScalarBroadcast() { << "With shape number " << i; } // Re-run with exchanged inputs. - for (int i = 0; i < static_cast(test_shapes.size()); ++i) { + for (size_t i = 0; i < test_shapes.size(); ++i) { QuantizedAddOpModel model_fixture( {tensor_type, {}, -3.f, 3.f}, {tensor_type, test_shapes[i], -3.f, 3.f}, {tensor_type, {}, -3.f, 3.f}, ActivationFunctionType_NONE); @@ -374,7 +374,7 @@ void QuantizedWithMixedBroadcast() { 1.0f, -0.7f, 0.9f, 1.2f, -1.7f, 1.7f, -1.2f, 1.6f, -1.3f}, {-0.1f, 2.5f, 1.2f, 0.8f, 0.4f, -1.5f, 1.7f, 3.0f, -0.6f, 1.0f, 1.6f, -1.3f}}; - for (int i = 0; i < static_cast(test_shapes.size()); ++i) { + for (size_t i = 0; i < test_shapes.size(); ++i) { QuantizedAddOpModel model_fixture({tensor_type, base_shape, -3.f, 3.f}, {tensor_type, test_shapes[i], -3.f, 3.f}, {tensor_type, {}, -3.f, 3.f}, @@ -391,7 +391,7 @@ void QuantizedWithMixedBroadcast() { << "With shape number " << i; } // Re-run with exchanged inputs. - for (int i = 0; i < static_cast(test_shapes.size()); ++i) { + for (size_t i = 0; i < test_shapes.size(); ++i) { QuantizedAddOpModel model_fixture({tensor_type, test_shapes[i], -3.f, 3.f}, {tensor_type, base_shape, -3.f, 3.f}, {tensor_type, {}, -3.f, 3.f}, From 5ee6579f2528fe275663b8d7e7dcabb91d227306 Mon Sep 17 00:00:00 2001 From: "Albert Z. 
Guo" Date: Thu, 4 Apr 2019 23:00:54 -0500 Subject: [PATCH 0019/3053] Update word2vec_basic.py refine comments --- tensorflow/examples/tutorials/word2vec/word2vec_basic.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py index b503709ee2a..fe5c434e907 100644 --- a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py +++ b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py @@ -100,7 +100,7 @@ def word2vec_basic(log_dir): # This is the original text but words are replaced by their codes # count - map of words(strings) to count of occurrences # dictionary - map of words(strings) to their codes(integers) - # reverse_dictionary - maps codes(integers) to words(strings) + # reverse_dictionary - map of codes(integers) to words(strings) data, count, unused_dictionary, reverse_dictionary = build_dataset( vocabulary, vocabulary_size) del vocabulary # Hint to reduce memory. @@ -186,8 +186,9 @@ def word2vec_basic(log_dir): # Compute the average NCE loss for the batch. # tf.nce_loss automatically draws a new sample of the negative labels each # time we evaluate the loss. - # Explanation of the meaning of NCE loss: + # Explanation of the meaning of NCE loss and why choosing NCE over tf.nn.sampled_softmax_loss: # http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/ + # http://papers.nips.cc/paper/5165-learning-word-embeddings-efficiently-with-noise-contrastive-estimation.pdf with tf.name_scope('loss'): loss = tf.reduce_mean( tf.nn.nce_loss( From 6538102fe8e55f88c3e57eda21916d8dc39d6e97 Mon Sep 17 00:00:00 2001 From: MichaelKonobeev Date: Sat, 6 Apr 2019 20:14:29 +0300 Subject: [PATCH 0020/3053] Improve formatting --- tensorflow/python/ops/nn_grad.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py index abdb4b8d2c1..19f631c4965 100644 --- a/tensorflow/python/ops/nn_grad.py +++ b/tensorflow/python/ops/nn_grad.py @@ -517,11 +517,12 @@ def _SparseSoftmaxCrossEntropyWithLogitsGrad(op, grad_loss, grad_grad): if grad_grad is not None and not _IsZero(grad_grad): softmax = nn_ops.softmax(logits) - grad += ((grad_grad - array_ops.squeeze( - math_ops.matmul(array_ops.expand_dims(grad_grad, 1), - array_ops.expand_dims(softmax, 2)), - axis=1)) * - softmax) + grad += ((grad_grad + - array_ops.squeeze( + math_ops.matmul(array_ops.expand_dims(grad_grad, 1), + array_ops.expand_dims(softmax, 2)), + axis=1) + ) * softmax) return grad, None From e3864556890854e16c5e914dda62535855524104 Mon Sep 17 00:00:00 2001 From: TheMindVirus Date: Wed, 15 May 2019 02:27:06 +0100 Subject: [PATCH 0021/3053] BeagleBone Black Tensorflow Lite build scripts --- tensorflow/lite/tools/make/build_bbb_lib.sh | 22 ++++++++++++ .../lite/tools/make/targets/bbb_makefile.inc | 35 +++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100755 tensorflow/lite/tools/make/build_bbb_lib.sh create mode 100644 tensorflow/lite/tools/make/targets/bbb_makefile.inc diff --git a/tensorflow/lite/tools/make/build_bbb_lib.sh b/tensorflow/lite/tools/make/build_bbb_lib.sh new file mode 100755 index 00000000000..a195c407793 --- /dev/null +++ b/tensorflow/lite/tools/make/build_bbb_lib.sh @@ -0,0 +1,22 @@ +#!/bin/bash -x +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR/../../../.." + +CC_PREFIX=arm-linux-gnueabihf- make -j 3 -f tensorflow/lite/tools/make/Makefile TARGET=bbb TARGET_ARCH=armv7l diff --git a/tensorflow/lite/tools/make/targets/bbb_makefile.inc b/tensorflow/lite/tools/make/targets/bbb_makefile.inc new file mode 100644 index 00000000000..dfbdd2f0c72 --- /dev/null +++ b/tensorflow/lite/tools/make/targets/bbb_makefile.inc @@ -0,0 +1,35 @@ +# Settings for BeagleBone Black. +ifeq ($(TARGET),bbb) + TARGET_ARCH := armv7l + TARGET_TOOLCHAIN_PREFIX := arm-linux-gnueabihf- + + ifeq ($(TARGET_ARCH), armv7l) + CXXFLAGS += \ + -march=armv7-a \ + -mfpu=neon \ + -funsafe-math-optimizations \ + -ftree-vectorize \ + -fPIC + + CFLAGS += \ + -march=armv7-a \ + -mfpu=neon \ + -funsafe-math-optimizations \ + -ftree-vectorize \ + -fPIC + + LDFLAGS := \ + -Wl,--no-export-dynamic \ + -Wl,--exclude-libs,ALL \ + -Wl,--gc-sections \ + -Wl,--as-needed + endif + + LIBS := \ + -lstdc++ \ + -lpthread \ + -lm \ + -ldl \ + -lrt + +endif From 3ccfda11544d9cf710efcae4a4599f30cce14fe6 Mon Sep 17 00:00:00 2001 From: frreiss Date: Wed, 15 May 2019 14:59:38 -0700 Subject: [PATCH 0022/3053] Fix linter warning --- .../saved_model/integration_tests/integration_scripts.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow/examples/saved_model/integration_tests/integration_scripts.py b/tensorflow/examples/saved_model/integration_tests/integration_scripts.py index 0db91facd65..8ac44131708 100644 --- a/tensorflow/examples/saved_model/integration_tests/integration_scripts.py +++ b/tensorflow/examples/saved_model/integration_tests/integration_scripts.py @@ -61,5 +61,4 @@ def MaybeRunScriptInstead(): # Append current path to import path and execute `SCRIPT_NAME` main. 
sys.path.extend([os.path.dirname(__file__)]) module_name = os.environ["SCRIPT_NAME"] - retval = app.run(importlib.import_module(module_name).main) - sys.exit(retval) + app.run(importlib.import_module(module_name).main) From e4378df8845089ee550b500d620668a70e37201e Mon Sep 17 00:00:00 2001 From: frreiss Date: Wed, 15 May 2019 15:33:42 -0700 Subject: [PATCH 0023/3053] Fix second linter warning --- tensorflow/python/platform/googletest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/platform/googletest.py b/tensorflow/python/platform/googletest.py index f146d751744..cabd85d8e79 100644 --- a/tensorflow/python/platform/googletest.py +++ b/tensorflow/python/platform/googletest.py @@ -24,9 +24,9 @@ import sys import tempfile # go/tf-wildcard-import -# pylint: disable=wildcard-import +# pylint: disable=wildcard-import,redefined-builtin from absl.testing.absltest import * -# pylint: enable=wildcard-import +# pylint: enable=wildcard-import,redefined-builtin from tensorflow.python.framework import errors from tensorflow.python.lib.io import file_io From 155bed6f2fcf1637e9b6063de8ff601156de6049 Mon Sep 17 00:00:00 2001 From: MichaelKonobeev Date: Wed, 29 May 2019 22:19:51 +0300 Subject: [PATCH 0024/3053] Support BatchMatMulV2 --- tensorflow/python/kernel_tests/sparse_xent_op_test.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/kernel_tests/sparse_xent_op_test.py b/tensorflow/python/kernel_tests/sparse_xent_op_test.py index be3347a5e1b..e6693f96f86 100644 --- a/tensorflow/python/kernel_tests/sparse_xent_op_test.py +++ b/tensorflow/python/kernel_tests/sparse_xent_op_test.py @@ -24,6 +24,7 @@ import time import numpy as np from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.compat import compat from tensorflow.python.client import session from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes @@ -204,6 +205,7 @@ class SparseXentTest(test.TestCase): op.op_def.name for op in sess.graph.get_operations() if op.op_def ] self.assertNotIn("BatchMatMul", op_names) + self.assertNotIn("BatchMatMulV2", op_names) print("cross entropy gradient err = ", err) self.assertLess(err, 5e-8) @@ -229,7 +231,10 @@ class SparseXentTest(test.TestCase): op_names = [ op.op_def.name for op in sess.graph.get_operations() if op.op_def ] - self.assertIn("BatchMatMul", op_names) + if compat.forward_compatible(2019, 4, 25): + self.assertIn("BatchMatMulV2", op_names) + else: + self.assertIn("BatchMatMul", op_names) print("cross entropy hessian err = ", err) self.assertLess(err, 5e-8) From 21eeaf272dae58f82ca0198de6a0bf4559f0c48c Mon Sep 17 00:00:00 2001 From: MichaelKonobeev Date: Wed, 29 May 2019 22:24:09 +0300 Subject: [PATCH 0025/3053] CrossEntropy testGradient BatchMatMulV2 compat --- tensorflow/python/kernel_tests/xent_op_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/python/kernel_tests/xent_op_test.py b/tensorflow/python/kernel_tests/xent_op_test.py index c3c7f867a1e..8fe974a8cea 100644 --- a/tensorflow/python/kernel_tests/xent_op_test.py +++ b/tensorflow/python/kernel_tests/xent_op_test.py @@ -202,6 +202,7 @@ class XentTest(test.TestCase): op.op_def.name for op in sess.graph.get_operations() if op.op_def ] self.assertNotIn("BatchMatMul", op_names) + self.assertNotIn("BatchMatMulV2", op_names) print("cross entropy gradient err = ", err) self.assertLess(err, 5e-8) From 2758d367e75e645c5c73c12bd98fdbdf25c3dbb2 Mon Sep 17 00:00:00 2001 From: frreiss Date: Wed, 29 
May 2019 15:35:26 -0700 Subject: [PATCH 0026/3053] Address review comments/linter warnings --- .../python/kernel_tests/bijectors/affine_test.py | 2 +- .../python/kernel_tests/bijectors/reshape_test.py | 11 ++++++++--- .../python/kernel_tests/bijectors/softplus_test.py | 2 +- .../distributions/python/kernel_tests/cauchy_test.py | 4 ++-- .../python/kernel_tests/deterministic_test.py | 8 ++++---- .../python/kernel_tests/half_normal_test.py | 5 +++-- .../python/kernel_tests/inverse_gamma_test.py | 11 ++++++----- .../kernel_tests/quantized_distribution_test.py | 4 ++-- .../python/kernel_tests/relaxed_bernoulli_test.py | 4 ++-- .../contrib/metrics/python/ops/metric_ops_test.py | 2 +- 10 files changed, 30 insertions(+), 23 deletions(-) diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py index cfb342049f2..8b61d4be63c 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/affine_test.py @@ -464,7 +464,7 @@ class AffineBijectorTest(test.TestCase): mu = [1., -1] with self.assertRaisesRegexp(errors.InvalidArgumentError, "diagonal part must be non-zero"): - bijector = Affine( + _ = Affine( shift=mu, # Has zero on the diagonal. scale_diag=[0., 1], diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py index 160d5794efc..4d9bbec770f 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py @@ -154,11 +154,16 @@ class _ReshapeBijectorTest(object): def _testInvalidDimensionsStatic(self, expected_error_message): """Version of _testInvalidDimensionsOpError for errors detected statically - at graph construction time.""" - shape_in, shape_out, feed_dict = self.build_shapes([2, 3], [1, 2, -2,]) + at graph construction time. + + Args: + expected_error_message: String that should be present in the error + message that `Reshape` raises for invalid shapes. + """ + shape_in, shape_out, _ = self.build_shapes([2, 3], [1, 2, -2,]) with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, expected_error_message): - bijector = Reshape( + _ = Reshape( event_shape_out=shape_out, event_shape_in=shape_in, validate_args=True) diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py index d8484ba22fd..2e7ab3ecfd2 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/softplus_test.py @@ -46,7 +46,7 @@ class SoftplusBijectorTest(test.TestCase): with self.cached_session(): with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, "must be non-zero"): - bijector = Softplus(hinge_softness=0., validate_args=True) + _ = Softplus(hinge_softness=0., validate_args=True) # Error detected statically; don't need to run op. 
def testBijectorForwardInverseEventDimsZero(self): diff --git a/tensorflow/contrib/distributions/python/kernel_tests/cauchy_test.py b/tensorflow/contrib/distributions/python/kernel_tests/cauchy_test.py index 353836fb75c..f5d6944d166 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/cauchy_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/cauchy_test.py @@ -403,8 +403,8 @@ class CauchyTest(test.TestCase): with self.cached_session(): with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, "Condition x > 0 did not hold"): - cauchy = cauchy_lib.Cauchy(loc=[1.], scale=[-5.], validate_args=True) - # Error detected statically; no need for cauchy.mode().eval() + _ = cauchy_lib.Cauchy(loc=[1.], scale=[-5.], validate_args=True) + # Error detected statically; no need for _.mode().eval() def testCauchyShape(self): with self.cached_session(): diff --git a/tensorflow/contrib/distributions/python/kernel_tests/deterministic_test.py b/tensorflow/contrib/distributions/python/kernel_tests/deterministic_test.py index 568ee8f20ff..e81ff7cc29c 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/deterministic_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/deterministic_test.py @@ -43,9 +43,9 @@ class DeterministicTest(test.TestCase): loc = rng.rand(2, 3, 4).astype(np.float32) with self.assertRaisesRegexp(errors.InvalidArgumentError, "Condition x >= 0"): - deterministic = deterministic_lib.Deterministic( + _ = deterministic_lib.Deterministic( loc, atol=-1, validate_args=True) - # Error detected statically; no need for deterministic.prob(0.).eval() + # Error detected statically; no need for _.prob(0.).eval() def testProbWithNoBatchDimsIntegerType(self): deterministic = deterministic_lib.Deterministic(0) @@ -198,9 +198,9 @@ class VectorDeterministicTest(test.TestCase): loc = rng.rand(2, 3, 4).astype(np.float32) with self.assertRaisesRegexp(errors.InvalidArgumentError, "Condition x >= 0"): - deterministic = deterministic_lib.VectorDeterministic( + _ = deterministic_lib.VectorDeterministic( loc, atol=-1, validate_args=True) - # Error detected statically; no need for deterministic.prob(loc).eval() + # Error detected statically; no need for _.prob(loc).eval() def testInvalidXRaises(self): loc = rng.rand(2, 3, 4).astype(np.float32) diff --git a/tensorflow/contrib/distributions/python/kernel_tests/half_normal_test.py b/tensorflow/contrib/distributions/python/kernel_tests/half_normal_test.py index a1b8a9e181f..3ed96e6fdb8 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/half_normal_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/half_normal_test.py @@ -42,6 +42,7 @@ def try_import(name): # pylint: disable=invalid-name tf_logging.warning("Could not import %s: %s" % (name, str(e))) return module + stats = try_import("scipy.stats") @@ -291,8 +292,8 @@ class HalfNormalTest(test.TestCase): with self.cached_session(): with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, "Condition x > 0 did not hold"): - halfnorm = hn_lib.HalfNormal(scale=[-5.], validate_args=True, name="G") - # Error detected statically; no need for halfnorm.mean().eval() + _ = hn_lib.HalfNormal(scale=[-5.], validate_args=True, name="G") + # Error detected statically; no need for _.mean().eval() def testHalfNormalShape(self): with self.cached_session(): diff --git a/tensorflow/contrib/distributions/python/kernel_tests/inverse_gamma_test.py b/tensorflow/contrib/distributions/python/kernel_tests/inverse_gamma_test.py index 
8ba791cad7d..7c46674cc04 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/inverse_gamma_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/inverse_gamma_test.py @@ -250,7 +250,8 @@ class InverseGammaTest(test.TestCase): fails += 0 if self._kstest(a, b, s) else 1 self.assertLess(fails, trials * 0.03) - def _kstest(self, alpha, beta, samples): + @staticmethod + def _kstest(alpha, beta, samples): # Uses the Kolmogorov-Smirnov test for goodness of fit. ks, _ = stats.kstest(samples, stats.invgamma(alpha, scale=beta).cdf) # Return True when the test passes. @@ -298,16 +299,16 @@ class InverseGammaTest(test.TestCase): beta_v = constant_op.constant(1.0, name="beta") with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, "alpha"): - inv_gamma = inverse_gamma.InverseGamma( + _ = inverse_gamma.InverseGamma( concentration=alpha_v, rate=beta_v, validate_args=True) - # Error detected statically; no need for inv_gamma.mean().eval() + # Error detected statically; no need for _.mean().eval() alpha_v = constant_op.constant(1.0, name="alpha") beta_v = constant_op.constant(0.0, name="beta") with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, "beta"): - inv_gamma = inverse_gamma.InverseGamma( + _ = inverse_gamma.InverseGamma( concentration=alpha_v, rate=beta_v, validate_args=True) - # Error detected statically; no need for inv_gamma.mean().eval() + # Error detected statically; no need for _.mean().eval() def testInverseGammaWithSoftplusConcentrationRate(self): with self.cached_session(): diff --git a/tensorflow/contrib/distributions/python/kernel_tests/quantized_distribution_test.py b/tensorflow/contrib/distributions/python/kernel_tests/quantized_distribution_test.py index 88773fb7aa0..82257e136ba 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/quantized_distribution_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/quantized_distribution_test.py @@ -364,12 +364,12 @@ class QuantizedDistributionTest(test.TestCase): with self.cached_session(): with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, "must be strictly less"): - qdist = distributions.QuantizedDistribution( + _ = distributions.QuantizedDistribution( distribution=distributions.Normal(loc=0., scale=1.), low=1., # not strictly less than high. high=1., validate_args=True) - # Error detected statically; no need for qdist.sample().eval() + # Error detected statically; no need for _.sample().eval() def testCutoffsMustBeIntegerValuedIfValidateArgsTrue(self): with self.cached_session(): diff --git a/tensorflow/contrib/distributions/python/kernel_tests/relaxed_bernoulli_test.py b/tensorflow/contrib/distributions/python/kernel_tests/relaxed_bernoulli_test.py index 85ee0095716..b709ce84125 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/relaxed_bernoulli_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/relaxed_bernoulli_test.py @@ -96,8 +96,8 @@ class RelaxedBernoulliTest(test.TestCase): p = constant_op.constant([0.1, 0.4]) with self.assertRaisesWithPredicateMatch(errors_impl.InvalidArgumentError, "x > 0 did not hold"): - dist = relaxed_bernoulli.RelaxedBernoulli(temperature, probs=p, - validate_args=True) + _ = relaxed_bernoulli.RelaxedBernoulli(temperature, probs=p, + validate_args=True) # Error detected statically; no need to run the op. 
def testDtype(self): diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py index e7a14e2514e..a8e176e6475 100644 --- a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py +++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py @@ -1736,7 +1736,7 @@ class StreamingAUCTest(test.TestCase): labels = constant_op.constant([0, 1, 1, 0], shape=(1, 4)) with self.assertRaisesRegexp(errors_impl.InvalidArgumentError, r"predictions must be in \[0, 1\]"): - _, update_op = metrics.streaming_auc(predictions, labels) + _, _ = metrics.streaming_auc(predictions, labels) # Error detected statically; no need to run the op. def testAllCorrect(self): From b9ba802746049cfd11adacc5cadbf0461b5d3f75 Mon Sep 17 00:00:00 2001 From: Imran Salam Date: Sun, 2 Jun 2019 16:37:41 +0500 Subject: [PATCH 0027/3053] [TF 2.0 API Docs] tf.image.adjust_brightness Added a usage example in image.adjust_brightness in image_ops_impl.py --- tensorflow/python/ops/image_ops_impl.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index f2230a1f2a2..04c6c5743fb 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -1590,6 +1590,13 @@ def adjust_brightness(image, delta): Returns: A brightness-adjusted tensor of the same shape and type as `image`. + + Usage Example: + ```python + >> import tensorflow as tf + >> x = tf.random.normal(shape=(256, 256, 3)) + >> tf.image.adjust_brightness(x, delta=0.1) + ``` """ with ops.name_scope(None, 'adjust_brightness', [image, delta]) as name: image = ops.convert_to_tensor(image, name='image') From 78bd76f0376e0b6b02d6b8030f887b49b60551c5 Mon Sep 17 00:00:00 2001 From: Imran Salam Date: Mon, 3 Jun 2019 00:23:57 +0500 Subject: [PATCH 0028/3053] [TF 2.0 API Docs] tf.image.adjust_jpeg_quality Updated adjust_jpeg_quality by adding a usage example in the docstring in image_ops_impl.py. Added raises that were happening but not occurring in the docstring. The issue has been raised and is provided in this link https://github.com/tensorflow/tensorflow/issues/29330 --- tensorflow/python/ops/image_ops_impl.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index f2230a1f2a2..ead9d169d11 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -1965,6 +1965,16 @@ def adjust_jpeg_quality(image, jpeg_quality, name=None): Returns: Adjusted image(s), same shape and DType as `image`. + + Usage Example: + ```python + >> import tensorflow as tf + >> x = tf.random.normal(shape=(256, 256, 3)) + >> tf.image.adjust_jpeg_quality(x, 75) + ``` + Raises: + InvalidArgumentError: quality must be in [0,100] + InvalidArgumentError: image must have 1 or 3 channels """ with ops.name_scope(name, 'adjust_jpeg_quality', [image]) as name: image = ops.convert_to_tensor(image, name='image') From 78da84dbff806d3982f5edc0cc8926fe75d4c274 Mon Sep 17 00:00:00 2001 From: Imran Salam Date: Thu, 6 Jun 2019 23:46:06 +0500 Subject: [PATCH 0029/3053] Usage example added in image.crop_and_resize Added a usage example in image.crop_and_resize and under image_ops_impl.py. 
The link to the issue is https://github.com/tensorflow/tensorflow/issues/29507 --- tensorflow/python/ops/image_ops_impl.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index e4f94819ec9..16172455ae6 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -3567,6 +3567,24 @@ def crop_and_resize_v2(image, Returns: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`. + + Usage Example: + ```python + >> import tensorflow as tf + >> BATCH_SIZE = 1 + >> NUM_BOXES = 5 + >> IMAGE_HEIGHT = 256 + >> IMAGE_WIDTH = 256 + >> CHANNELS = 3 + >> CROP_SIZE = (24, 24) + + >> image = tf.random.normal(shape=(BATCH_SIZE, IMAGE_HEIGHT, IMAGE_WIDTH, CHANNELS) ) + >> boxes = tf.random.uniform(shape=(NUM_BOXES, 4)) + >> box_indices = tf.random.uniform(shape=(NUM_BOXES,), minval=0, maxval=BATCH_SIZE, dtype=tf.int32) + >> output = tf.image.crop_and_resize(image, boxes, box_indices, CROP_SIZE) + >> print(output.shape) + (5, 24, 24, 3) + ``` """ return gen_image_ops.crop_and_resize(image, boxes, box_indices, crop_size, method, extrapolation_value, name) From dbd3c8205a9ccb7b6b55904b2811622d554412a0 Mon Sep 17 00:00:00 2001 From: Lukas Folle Date: Thu, 13 Jun 2019 21:49:12 +0200 Subject: [PATCH 0030/3053] Revert "Doc for maximum improved" This reverts commit 79225715bd7544716de4b8a7655657ae1f6ef249. --- tensorflow/python/keras/activations.py | 3 +++ tensorflow/python/keras/backend.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/keras/activations.py b/tensorflow/python/keras/activations.py index 5f7ade6ea49..fe64485d0c9 100644 --- a/tensorflow/python/keras/activations.py +++ b/tensorflow/python/keras/activations.py @@ -277,6 +277,9 @@ def linear(x): Returns: The linear activation: `x`. + + Note: + Often used as last layer of regression networks. """ return x diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py index bf0b7364335..5d73eb78ed6 100644 --- a/tensorflow/python/keras/backend.py +++ b/tensorflow/python/keras/backend.py @@ -2245,7 +2245,7 @@ def maximum(x, y): y: Tensor or variable. Returns: - A tensor with the element wise maximum value(s) of `x` and `y. + A tensor with the maximum value(s) of `x` and `y. Examples: ```python From e4562b4664dfd433ce52347eb5f5748231494cad Mon Sep 17 00:00:00 2001 From: Lukas Folle Date: Thu, 13 Jun 2019 21:52:37 +0200 Subject: [PATCH 0031/3053] Revert "Revert "Doc for maximum improved"" This reverts commit dbd3c8205a9ccb7b6b55904b2811622d554412a0. --- tensorflow/python/keras/activations.py | 3 --- tensorflow/python/keras/backend.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/tensorflow/python/keras/activations.py b/tensorflow/python/keras/activations.py index fe64485d0c9..5f7ade6ea49 100644 --- a/tensorflow/python/keras/activations.py +++ b/tensorflow/python/keras/activations.py @@ -277,9 +277,6 @@ def linear(x): Returns: The linear activation: `x`. - - Note: - Often used as last layer of regression networks. """ return x diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py index 5d73eb78ed6..bf0b7364335 100644 --- a/tensorflow/python/keras/backend.py +++ b/tensorflow/python/keras/backend.py @@ -2245,7 +2245,7 @@ def maximum(x, y): y: Tensor or variable. Returns: - A tensor with the maximum value(s) of `x` and `y. + A tensor with the element wise maximum value(s) of `x` and `y. 
Examples: ```python From ac24620229764b052d8e61f0fc5f9c164516661e Mon Sep 17 00:00:00 2001 From: Greg Peatfield Date: Tue, 18 Jun 2019 17:19:25 -0400 Subject: [PATCH 0032/3053] Link to paper updated. Old link to paper was broken recently. --- .../image/python/ops/single_image_random_dot_stereograms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py index 2b0bcf64019..dfc6af3e558 100755 --- a/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py +++ b/tensorflow/contrib/image/python/ops/single_image_random_dot_stereograms.py @@ -48,7 +48,7 @@ def single_image_random_dot_stereograms(depth_values, corrupt the encode 3-D data within the image. Based upon [this - paper](http://www.learningace.com/doc/4331582/b6ab058d1e206d68ab60e4e1ead2fe6e/sirds-paper). + paper](https://www.cs.waikato.ac.nz/~ihw/papers/94-HWT-SI-IHW-SIRDS-paper.pdf). This outputs a SIRDS image as picture_out.png: From 61ce785eede101a3a5e77c5d0fd88507bd5f455f Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 19 Jun 2019 19:56:49 +0000 Subject: [PATCH 0033/3053] Add bool support for unique_with_counts This fix tries to address the issue raised in 29863 where unique_with_counts does not support bool dtype yet. This fix fixes 29863. Signed-off-by: Yong Tang --- tensorflow/core/kernels/unique_op.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/core/kernels/unique_op.cc b/tensorflow/core/kernels/unique_op.cc index adf84bae49c..4968284c721 100644 --- a/tensorflow/core/kernels/unique_op.cc +++ b/tensorflow/core/kernels/unique_op.cc @@ -237,6 +237,7 @@ class UniqueOp : public OpKernel { UniqueOp) TF_CALL_REAL_NUMBER_TYPES(REGISTER_UNIQUE); REGISTER_UNIQUE(string) +REGISTER_UNIQUE(bool) #undef REGISTER_UNIQUE // Fake integer GPU kernels so that the use of Unique in optimizers (to From f2a2a21169660e70eb109b0c5dba534e43094f56 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 19 Jun 2019 19:57:36 +0000 Subject: [PATCH 0034/3053] Add test cases for bool dtype with unique/unique_with_counts Signed-off-by: Yong Tang --- .../python/kernel_tests/unique_op_test.py | 49 +++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/tensorflow/python/kernel_tests/unique_op_test.py b/tensorflow/python/kernel_tests/unique_op_test.py index f203263e0c5..dce5a2a4ad4 100644 --- a/tensorflow/python/kernel_tests/unique_op_test.py +++ b/tensorflow/python/kernel_tests/unique_op_test.py @@ -88,6 +88,28 @@ class UniqueTest(test.TestCase): for i in range(len(x)): self.assertEqual(x[i], tf_y[tf_idx[i]]) + def testBool(self): + x = np.random.choice([True, False], size=7000) + with self.cached_session() as sess: + y, idx = array_ops.unique(x) + tf_y, tf_idx = self.evaluate([y, idx]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]]) + + def testBoolV2(self): + x = np.random.choice([True, False], size=7000) + with self.cached_session() as sess: + y, idx = gen_array_ops.unique_v2(x, axis=np.array([], np.int32)) + tf_y, tf_idx = self.evaluate([y, idx]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]]) + class UniqueWithCountsTest(test.TestCase): @@ -166,6 +188,33 @@ class UniqueWithCountsTest(test.TestCase): for value, count in zip(tf_y, 
tf_count): self.assertEqual(count, np.sum(x == value)) + def testBool(self): + x = np.random.choice([True, False], size=7000) + with self.cached_session() as sess: + y, idx, count = array_ops.unique_with_counts(x) + tf_y, tf_idx, tf_count = self.evaluate([y, idx, count]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]]) + for value, count in zip(tf_y, tf_count): + self.assertEqual(count, np.sum(x == value)) + + def testBoolV2(self): + x = np.random.choice([True, False], size=7000) + with self.cached_session() as sess: + y, idx, count = gen_array_ops.unique_with_counts_v2( + x, axis=np.array([], np.int32)) + tf_y, tf_idx, tf_count = self.evaluate([y, idx, count]) + + self.assertEqual(len(x), len(tf_idx)) + self.assertEqual(len(tf_y), len(np.unique(x))) + for i in range(len(x)): + self.assertEqual(x[i], tf_y[tf_idx[i]]) + for value, count in zip(tf_y, tf_count): + self.assertEqual(count, np.sum(x == value)) + if __name__ == '__main__': test.main() From 87a5d1c548611a21f375fb48413917329e9c0b2f Mon Sep 17 00:00:00 2001 From: Kaixi Hou Date: Wed, 19 Jun 2019 14:45:55 -0700 Subject: [PATCH 0035/3053] Added support to CUDNN Rnn V2 in Keras APIs --- .../python/keras/layers/cudnn_recurrent.py | 41 +++++++++++++------ 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/tensorflow/python/keras/layers/cudnn_recurrent.py b/tensorflow/python/keras/layers/cudnn_recurrent.py index 68ac8b7b277..cec614f087a 100644 --- a/tensorflow/python/keras/layers/cudnn_recurrent.py +++ b/tensorflow/python/keras/layers/cudnn_recurrent.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function import collections +import os from tensorflow.python.framework import constant_op from tensorflow.python.keras import backend as K @@ -293,13 +294,20 @@ class CuDNNGRU(_CuDNNRNN): ], shape=self._vector_shape) - outputs, h, _, _ = gen_cudnn_rnn_ops.cudnn_rnn( - inputs, - input_h=input_h, - input_c=0, - params=params, - is_training=True, - rnn_mode='gru') + use_cudnn_v2 = os.environ.get("TF_CUDNN_RNN_USE_V2", "0") + args = { + "input": inputs, + "input_h": input_h, + "input_c": 0, + "params": params, + "is_training": True, + "rnn_mode": 'gru', + } + + if use_cudnn_v2 != "1": + outputs, h, _, _ = gen_cudnn_rnn_ops.cudnn_rnn(**args) + else: + outputs, h, _, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv2(**args) if self.stateful or self.return_state: h = h[0] @@ -492,12 +500,19 @@ class CuDNNLSTM(_CuDNNRNN): ], shape=self._vector_shape) - outputs, h, c, _ = gen_cudnn_rnn_ops.cudnn_rnn( - inputs, - input_h=input_h, - input_c=input_c, - params=params, - is_training=True) + use_cudnn_v2 = os.environ.get("TF_CUDNN_RNN_USE_V2", "0") + args = { + "input": inputs, + "input_h": input_h, + "input_c": input_c, + "params": params, + "is_training": True, + } + + if use_cudnn_v2 != "1": + outputs, h, c, _ = gen_cudnn_rnn_ops.cudnn_rnn(**args) + else: + outputs, h, c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv2(**args) if self.stateful or self.return_state: h = h[0] From bb1fd4c786a7f4f7697973caf01ddd5f53316a03 Mon Sep 17 00:00:00 2001 From: Lukas Folle Date: Thu, 20 Jun 2019 10:27:09 +0200 Subject: [PATCH 0036/3053] Revert changes from different branch. 
--- tensorflow/python/keras/activations.py | 3 --- tensorflow/python/keras/backend.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/tensorflow/python/keras/activations.py b/tensorflow/python/keras/activations.py index fe64485d0c9..5f7ade6ea49 100644 --- a/tensorflow/python/keras/activations.py +++ b/tensorflow/python/keras/activations.py @@ -277,9 +277,6 @@ def linear(x): Returns: The linear activation: `x`. - - Note: - Often used as last layer of regression networks. """ return x diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py index 5d73eb78ed6..bf0b7364335 100644 --- a/tensorflow/python/keras/backend.py +++ b/tensorflow/python/keras/backend.py @@ -2245,7 +2245,7 @@ def maximum(x, y): y: Tensor or variable. Returns: - A tensor with the maximum value(s) of `x` and `y. + A tensor with the element wise maximum value(s) of `x` and `y. Examples: ```python From 6317682634d1b4cffbda02af1c3c0bd7c1afe8f0 Mon Sep 17 00:00:00 2001 From: Lukas Folle Date: Thu, 20 Jun 2019 10:28:11 +0200 Subject: [PATCH 0037/3053] Improved documentation by adding several examples. --- tensorflow/python/keras/backend.py | 86 +++++++++++++++++++++++++----- 1 file changed, 73 insertions(+), 13 deletions(-) diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py index bf0b7364335..c6fb305ae5b 100644 --- a/tensorflow/python/keras/backend.py +++ b/tensorflow/python/keras/backend.py @@ -2245,19 +2245,7 @@ def maximum(x, y): y: Tensor or variable. Returns: - A tensor with the element wise maximum value(s) of `x` and `y. - - Examples: - ```python - # maximum of two tensors - >>> x = tf.Variable([[1, 2], [3, 4]]) - >>> y = tf.Variable([[2, 1], [0, -1]]) - >>> m = tf.keras.backend.maximum(x, y) - >>> m - - ``` + A tensor. """ return math_ops.maximum(x, y) @@ -2506,6 +2494,17 @@ def concatenate(tensors, axis=-1): Returns: A tensor. + + Example: + ```python + >>> a = tf.constant([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + >>> b = tf.constant([[10, 20, 30], [40, 50, 60], [70, 80, 90]]) + >>> tf.keras.backend.concatenate((a, b), axis=1) + + ``` """ if axis < 0: rank = ndim(tensors[0]) @@ -2530,6 +2529,21 @@ def reshape(x, shape): Returns: A tensor. + + Example: + ```python + >>> a = tf.constant([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]) + >>> a + + >>> tf.keras.backend.reshape(a, shape=(2, 6)) + + ``` """ return array_ops.reshape(x, shape) @@ -2545,6 +2559,22 @@ def permute_dimensions(x, pattern): Returns: A tensor. + + Example: + ```python + >>> a = tf.constant([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]) + >>> a + + >>> tf.keras.backend.permute_dimensions(a, pattern=(1, 0)) + + ``` """ return array_ops.transpose(x, perm=pattern) @@ -2656,6 +2686,13 @@ def repeat_elements(x, rep, axis): Returns: A tensor. + + Example: + ```python + >>> b = tf.constant([1, 2, 3]) + >>> tf.keras.backend.repeat_elements(b, rep=2, axis=0) + + ``` """ x_shape = x.shape.as_list() # For static axis @@ -2708,6 +2745,22 @@ def repeat(x, n): Returns: A tensor. + + Example: + ```python + >>> b = tf.constant([[1, 2], [3, 4]]) + >>> b + + >>> tf.keras.backend.repeat(b, n=2) + + ``` """ assert ndim(x) == 2 x = array_ops.expand_dims(x, 1) @@ -2735,6 +2788,13 @@ def arange(start, stop=None, step=1, dtype='int32'): Returns: An integer tensor. + Example: + ```python + >>> tf.keras.backend.arange(start=0, stop=10, step=1.5) + + + ``` + """ # Match the behavior of numpy and Theano by returning an empty sequence. 
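  # (Illustrative note: with `stop` omitted and a negative `start`, e.g.
  # K.arange(-5), the result is an empty int32 tensor, matching the empty
  # array that np.arange(-5) returns.)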
if stop is None and start < 0: From 3c8189eddd07b70b8af3f98c048c0ebe5e7415fe Mon Sep 17 00:00:00 2001 From: Lukas Folle Date: Thu, 20 Jun 2019 10:39:52 +0200 Subject: [PATCH 0038/3053] Added two more examples. --- tensorflow/python/keras/backend.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py index c6fb305ae5b..bb98dfc28a4 100644 --- a/tensorflow/python/keras/backend.py +++ b/tensorflow/python/keras/backend.py @@ -2831,6 +2831,17 @@ def flatten(x): Returns: A tensor, reshaped into 1-D + + Example: + ```python + >>> b = tf.constant([[1, 2], [3, 4]]) + >>> b + + >>> tf.keras.backend.flatten(b) + + ``` """ return array_ops.reshape(x, [-1]) @@ -2992,6 +3003,19 @@ def stack(x, axis=0): Returns: A tensor. + + Example: + ```python + >>> a = tf.constant([[1, 2],[3, 4]]) + >>> b = tf.constant([[10, 20],[30, 40]]) + >>> tf.keras.backend.stack((a, b)) + + ``` """ return array_ops.stack(x, axis=axis) From ca602c05456e364787bc00090fc4dc52f5e7bdd8 Mon Sep 17 00:00:00 2001 From: "candy.dc" Date: Fri, 21 Jun 2019 15:22:21 +0800 Subject: [PATCH 0039/3053] Fix: API`init_from_checkpoint` Restore Op placement --- .../framework/python/framework/checkpoint_utils.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tensorflow/contrib/framework/python/framework/checkpoint_utils.py b/tensorflow/contrib/framework/python/framework/checkpoint_utils.py index 6dd887edf59..811df7a55ae 100644 --- a/tensorflow/contrib/framework/python/framework/checkpoint_utils.py +++ b/tensorflow/contrib/framework/python/framework/checkpoint_utils.py @@ -21,6 +21,7 @@ from __future__ import print_function import six +from tensorflow.python.framework import ops from tensorflow.python.ops import io_ops from tensorflow.python.ops import state_ops from tensorflow.python.ops import variable_scope as vs @@ -116,9 +117,10 @@ def _set_checkpoint_initializer(variable, file_pattern, tensor_name, slice_spec, name: Name of the operation. """ base_type = variable.dtype.base_dtype - restore_op = io_ops.restore_v2( - file_pattern, [tensor_name], [slice_spec], [base_type], name=name)[0] - variable._initializer_op = state_ops.assign(variable, restore_op) + with ops.device(variable.device), ops.device("/cpu:0"): + restore_op = io_ops.restore_v2( + file_pattern, [tensor_name], [slice_spec], [base_type], name=name)[0] + variable._initializer_op = state_ops.assign(variable, restore_op) def _set_variable_or_list_initializer(variable_or_list, file_pattern, From 78359545a6803236f96bae6cb92bf600a599f963 Mon Sep 17 00:00:00 2001 From: frreiss Date: Sat, 22 Jun 2019 16:53:34 -0700 Subject: [PATCH 0040/3053] Fix linter warnings --- tensorflow/python/ops/check_ops.py | 50 +++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py index 28c41750cd6..fcf8a344d00 100644 --- a/tensorflow/python/ops/check_ops.py +++ b/tensorflow/python/ops/check_ops.py @@ -89,15 +89,28 @@ def _shape_and_dtype_str(tensor): def _unary_assert_doc(sym, sym_name): """ - Common docstring for assert_* ops that evaluate a unary predicate over every + Common docstring for assert_* ops that evaluate a unary predicate over every element of a tensor. Args: sym: Mathematical symbol for the check performed on each element, i.e. 
"> 0" sym_name: English-language name for the op described by sym + + Returns: + Decorator that adds the appropriate docstring to the function for symbol + `sym`. """ def _decorator(func): + """ + Generated decorator that adds the appropriate docstring to the function for + symbol `sym`. + + Args: + func: Function for a TensorFlow op + + Returns a version of `func` with documentation attached. + """ opname = func.__name__ cap_sym_name = sym_name.capitalize() @@ -146,8 +159,21 @@ def _binary_assert_doc(sym): Args: sym: Binary operation symbol, i.e. "==" + + Returns a decorator that adds the appropriate docstring to the function for + symbol `sym`. """ def _decorator(func): + """ + Generated decorator that adds the appropriate docstring to the function for + symbol `sym`. + + Args: + func: Function for a TensorFlow op + + Returns: + A version of `func` with documentation attached. + """ opname = func.__name__ func.__doc__ = """ @@ -198,7 +224,8 @@ def _make_assert_msg_data(sym, x, y, summarize, test_op): Args: sym: Mathematical symbol for the test to apply to pairs of tensor elements, i.e. "==" - x, y: Inputs to the assertion after convert_to_tensor() + x: First input to the assertion after applying `convert_to_tensor()` + y: Second input to the assertion summarize: Value of the "summarize" parameter to the original assert_* call; tells how many elements of each tensor to print. test_op: TensorFlow op that returns a Boolean tensor with True in each @@ -248,14 +275,15 @@ def _make_assert_msg_data(sym, x, y, summarize, test_op): def _pretty_print(data_item, summarize): """ Format a data item for use in an error message in eager mode. - + Args: data_item: One of the items in the "data" argument to an assert_* function. Can be a Tensor or a scalar value. summarize: How many elements to retain of each tensor-valued entry in data. - Returns an appropriate string representation of data_item + Returns: + An appropriate string representation of data_item """ if isinstance(data_item, ops.Tensor): arr = data_item.numpy() @@ -289,11 +317,17 @@ def _binary_assert(sym, opname, op_func, static_func, inputs to the assertion, will return a Boolean ndarray with containing True in all positions where the assertion PASSES. i.e. lambda x,y: (x == y) for assert_equal() - x, y, data, summarize, message, name: See doc in _binary_assert_doc - above. + x: Numeric `Tensor`. + y: Numeric `Tensor`, same dtype as and broadcastable to `x`. + data: The tensors to print out if the condition is False. Defaults to + error message and first few entries of `x`, `y`. + summarize: Print this many entries of each tensor. + message: A string to prefix to the default message. + name: A name for this operation (optional). Defaults to the value of + `opname`. Returns: - See doc in _binary_assert_doc(). + See docstring template in _binary_assert_doc(). """ with ops.name_scope(name, opname, [x, y, data]): x = ops.convert_to_tensor(x, name='x') @@ -304,7 +338,7 @@ def _binary_assert(sym, opname, op_func, static_func, condition = math_ops.reduce_all(test_op) if condition: return - + # If we get here, the assertion has failed. # Default to printing 3 elements like control_flow_ops.Assert (used # by graph mode) does. 
Also treat negative values as "print From 5b733714410c3dc740f6590d4b1e9c4c0ac4a050 Mon Sep 17 00:00:00 2001 From: frreiss Date: Sat, 22 Jun 2019 17:05:00 -0700 Subject: [PATCH 0041/3053] Change 'Returns' to 'Returns:' just in case --- tensorflow/python/ops/check_ops.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py index fcf8a344d00..951d5e8e6b7 100644 --- a/tensorflow/python/ops/check_ops.py +++ b/tensorflow/python/ops/check_ops.py @@ -109,7 +109,8 @@ def _unary_assert_doc(sym, sym_name): Args: func: Function for a TensorFlow op - Returns a version of `func` with documentation attached. + Returns: + Version of `func` with documentation attached. """ opname = func.__name__ cap_sym_name = sym_name.capitalize() @@ -160,7 +161,8 @@ def _binary_assert_doc(sym): Args: sym: Binary operation symbol, i.e. "==" - Returns a decorator that adds the appropriate docstring to the function for + Returns: + Decorator that adds the appropriate docstring to the function for symbol `sym`. """ def _decorator(func): From 206e5bbdb409bff2ef6d9f71a74d64f5d504b76c Mon Sep 17 00:00:00 2001 From: Fredrik Knutsson Date: Tue, 25 Jun 2019 09:46:23 +0200 Subject: [PATCH 0042/3053] Adding instructions on how to run CMSIS-NN opt kernels using mbed Change-Id: I31812627f95de1f8dea5704d5880cc1ffcd132cc --- tensorflow/lite/experimental/micro/README.md | 39 +++++++++++++++++--- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/tensorflow/lite/experimental/micro/README.md b/tensorflow/lite/experimental/micro/README.md index b8ed89d552c..102d1a00aa2 100644 --- a/tensorflow/lite/experimental/micro/README.md +++ b/tensorflow/lite/experimental/micro/README.md @@ -366,17 +366,46 @@ optimizations and link it with the microlite lib. To utilize the CMSIS-NN optimized kernels, choose your target, e.g. Bluepill, and build with: -make -f tensorflow/lite/experimental/micro/tools/make/Makefile TAGS=cmsis-nn -TARGET=bluepill test +``` +make -f tensorflow/lite/experimental/micro/tools/make/Makefile TAGS=cmsis-nn TARGET=bluepill test +``` That will build the microlite lib including CMSIS-NN optimized kernels based on the version downloaded by 'download_dependencies.sh', so make sure you have run this script. If you want to utilize another version of CMSIS, clone it to a custom location run the following command: -make -f tensorflow/lite/experimental/micro/tools/make/Makefile -CMSIS_PATH= TAGS=cmsis-nn TARGET=bluepill test (--- Under -development, it will build, but test will fail ---) +``` +make -f tensorflow/lite/experimental/micro/tools/make/Makefile CMSIS_PATH= TAGS=cmsis-nn TARGET=bluepill test +``` + +To test the optimized kernel(s) on your target platform using mbed (depthwise +conv in this example), follow these steps: + +1. Clone CMSIS to a custom location () url: + https://github.com/ARM-software/CMSIS_5.git Make sure you're on the + development branch. +2. Generate the project for depthwise conv mbed test: +``` +make -f tensorflow/lite/experimental/micro/tools/make/Makefile TAGS=cmsis-nn CMSIS_PATH= generate_depthwise_conv_test_mbed_project +``` +3. Go to the generated mbed folder: +``` +cd tensorflow/lite/experimental/micro/tools/make/gen/linux_x86_64/prj/depthwise_conv_test/mbed +``` +4. Follow the steps in README_MBED.md to setup the environment. Or simply do: + ``` +mbed config root . 
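# (Roughly: `mbed config root .` above marks this directory as the mbed
# program root, and `mbed deploy` below fetches the referenced dependencies
# such as mbed-os before the profile patch that follows.)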
+mbed deploy +python -c 'import fileinput, glob; +for filename in glob.glob("mbed-os/tools/profiles/*.json"): + for line in fileinput.input(filename, inplace=True): + print(line.replace("\"-std=gnu++98\"","\"-std=gnu++11\", \"-fpermissive\""))' +``` +7. Compile and flash. The 'auto' flag requires your target to be plugged in. +``` +mbed compile -m auto -t GCC_ARM -f --source . --source /CMSIS/NN/Include --source /CMSIS/NN/Source/ConvolutionFunctions --source /CMSIS/DSP/Include --source /CMSIS/Core/Include -j8 +``` ## Goals From 201300a095cb389423497e808380b18ccce07fc8 Mon Sep 17 00:00:00 2001 From: Pete Blacker Date: Fri, 28 Jun 2019 17:32:56 +0100 Subject: [PATCH 0043/3053] Added built and test support for the Leon 3 processor to the TensorFlow lite micro framework --- .../experimental/micro/testing/leon_commands | 3 ++ .../micro/testing/test_leon_binary.sh | 48 +++++++++++++++++++ .../tools/make/targets/leon_makefile.inc | 9 ++++ 3 files changed, 60 insertions(+) create mode 100644 tensorflow/lite/experimental/micro/testing/leon_commands create mode 100755 tensorflow/lite/experimental/micro/testing/test_leon_binary.sh create mode 100644 tensorflow/lite/experimental/micro/tools/make/targets/leon_makefile.inc diff --git a/tensorflow/lite/experimental/micro/testing/leon_commands b/tensorflow/lite/experimental/micro/testing/leon_commands new file mode 100644 index 00000000000..5deb5f5dbc0 --- /dev/null +++ b/tensorflow/lite/experimental/micro/testing/leon_commands @@ -0,0 +1,3 @@ +run +quit + diff --git a/tensorflow/lite/experimental/micro/testing/test_leon_binary.sh b/tensorflow/lite/experimental/micro/testing/test_leon_binary.sh new file mode 100755 index 00000000000..d40bf149ccb --- /dev/null +++ b/tensorflow/lite/experimental/micro/testing/test_leon_binary.sh @@ -0,0 +1,48 @@ +#!/bin/bash -e +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Tests a LEON 3 ELF binary by executing it using the TSIM emulator and parsing +# the log output. +# +# First argument is the binary location. +# Second argument is a regular expression that's required to be in the output logs +# for the test to pass. + +declare -r ROOT_DIR=`pwd` +declare -r TEST_TMPDIR=/tmp/test_bluepill_binary/ +declare -r MICRO_LOG_PATH=${TEST_TMPDIR}/$1 +declare -r MICRO_LOG_FILENAME=${MICRO_LOG_PATH}/logs.txt +declare -r LEON_ +mkdir -p ${MICRO_LOG_PATH} + +# Get the location of this script file as an absolute path +SCRIPT_PATH="`dirname \"$BASH_SOURCE\"`" +SCRIPT_PATH="`( cd \"$SCRIPT_PATH\" && pwd )`" +LEON_COMMANDS="$SCRIPT_PATH/leon_commands" + +echo "pwd is ${ROOT_DIR}" + +tsim-leon3 $1 -c ${LEON_COMMANDS} 2>&1 | tee ${MICRO_LOG_FILENAME} + +if grep -q "$2" ${MICRO_LOG_FILENAME} +then + echo "$1: PASS" + exit 0 +else + echo "$1: FAIL - '$2' not found in logs." 
+ exit 1 +fi + diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/leon_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/leon_makefile.inc new file mode 100644 index 00000000000..fc8673d1268 --- /dev/null +++ b/tensorflow/lite/experimental/micro/tools/make/targets/leon_makefile.inc @@ -0,0 +1,9 @@ +# Settings for x86 on Linux +ifeq ($(TARGET), leon) + PLATFORM_FLAGS = -O3 -mcpu=leon3 + CXXFLAGS += -std=c++11 $(PLATFORM_FLAGS) + CCFLAGS += $(PLATFORM_FLAGS) + TARGET_ARCH := leon + TARGET_TOOLCHAIN_PREFIX := sparc-gaisler-elf- + TEST_SCRIPT := tensorflow/lite/experimental/micro/testing/test_leon_binary.sh +endif From 1b8786471f49d6f13ce237524e694a81ca930957 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 30 Jun 2019 10:06:20 +0000 Subject: [PATCH 0044/3053] Fix KeyError when validation_data was given as a dict This fix tries to address the issue raised in 30122 where a KeyError was thrown when validation_data was given as a dict during the mode.fit. This fix fixes the issue. Thisfix fixes 30122. Signed-off-by: Yong Tang --- tensorflow/python/keras/engine/training_arrays.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/keras/engine/training_arrays.py b/tensorflow/python/keras/engine/training_arrays.py index bc8944a0a08..206c8aefdb2 100644 --- a/tensorflow/python/keras/engine/training_arrays.py +++ b/tensorflow/python/keras/engine/training_arrays.py @@ -207,7 +207,8 @@ def model_iteration(model, val_samples_or_steps = validation_steps else: # Get num samples for printing. - val_samples_or_steps = val_inputs and val_inputs[0].shape[0] or None + vals = val_inputs.values() if isinstance(val_inputs, dict) else val_inputs + val_samples_or_steps = vals and vals[0].shape[0] or None if mode == ModeKeys.TRAIN and verbose: _print_train_info(num_samples_or_steps, val_samples_or_steps, is_dataset) From 49f1a478e0c8eb1311679457c3d648395ab51202 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 30 Jun 2019 10:16:58 +0000 Subject: [PATCH 0045/3053] Test case for GitHub issue 30122 Signed-off-by: Yong Tang --- .../keras/engine/training_arrays_test.py | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/tensorflow/python/keras/engine/training_arrays_test.py b/tensorflow/python/keras/engine/training_arrays_test.py index 280c3699ee4..943fc0d343e 100644 --- a/tensorflow/python/keras/engine/training_arrays_test.py +++ b/tensorflow/python/keras/engine/training_arrays_test.py @@ -110,6 +110,43 @@ class PrintTrainingInfoTest(parameterized.TestCase): if do_validation: self.assertIn(", validate on 50 samples", mock_stdout.getvalue()) + def test_dict_input(self): + """Test case for GitHub issue 30122.""" + train_input_0 = np.random.rand(1000, 1) + train_input_1 = np.random.rand(1000, 1) + train_labels = np.random.rand(1000, 1) + val_input_0 = np.random.rand(1000, 1) + val_input_1 = np.random.rand(1000, 1) + val_labels = np.random.rand(1000, 1) + + input_0 = keras.Input(shape=(None,), name='input_0') + input_1 = keras.Input(shape=(None,), name='input_1') + + class my_model(keras.Model): + def __init__(self): + super(my_model, self).__init__(self) + self.hidden_layer_0 = keras.layers.Dense(100, activation="relu") + self.hidden_layer_1 = keras.layers.Dense(100, activation="relu") + self.concat = keras.layers.Concatenate() + self.out_layer = keras.layers.Dense(1, activation="sigmoid") + + def call(self, inputs=[input_0, input_1]): + activation_0 = self.hidden_layer_0(inputs['input_0']) + activation_1 = 
self.hidden_layer_1(inputs['input_1']) + concat = self.concat([activation_0, activation_1]) + return self.out_layer(concat) + + model = my_model() + model.compile(loss="mae", optimizer="adam") + + mock_stdout = six.StringIO() + with test.mock.patch.object(sys, "stdout", mock_stdout): + model.fit( + x={'input_0': train_input_0, 'input_1': train_input_1}, + y=train_labels, + validation_data=( + {'input_0': val_input_0, 'input_1': val_input_1}, val_labels)) + if __name__ == "__main__": test.main() From 7d2750d03697da5343ede22192b7762c1a83f724 Mon Sep 17 00:00:00 2001 From: Lukas Folle Date: Sun, 30 Jun 2019 13:12:50 +0200 Subject: [PATCH 0046/3053] Fixed empty lines. --- tensorflow/python/keras/backend.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py index b95e3e2ad17..3e1cc87eee9 100644 --- a/tensorflow/python/keras/backend.py +++ b/tensorflow/python/keras/backend.py @@ -2755,7 +2755,6 @@ def repeat(x, n): ``` @@ -3010,7 +3009,6 @@ def stack(x, axis=0): ``` From 2e4d3951eb618a7c34d5e629fc2506ea2a62b4a7 Mon Sep 17 00:00:00 2001 From: Gabriel <18050620+gabriel-vanzandycke@users.noreply.github.com> Date: Mon, 1 Jul 2019 15:53:56 +0200 Subject: [PATCH 0047/3053] Correct Tensor order for dilation2D `gen_nn_ops.dilation2d` seems to be in `NHWC` while the parent function was asking for `NCHW`. I corrected the doc and the check. --- tensorflow/python/ops/nn_ops.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index 418a34fce50..3dbd54592c2 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -280,7 +280,7 @@ def dilation2d_v2( tensor. Must be: `[1, stride_height, stride_width, 1]`. padding: A `string` from: `"SAME", "VALID"`. The type of padding algorithm to use. - data_format: A `string`, only `"NCHW"` is currently supported. + data_format: A `string`, only `"NHWC"` is currently supported. dilations: A list of `ints` that has length `>= 4`. The input stride for atrous morphological dilation. Must be: `[1, rate_height, rate_width, 1]`. @@ -289,8 +289,8 @@ def dilation2d_v2( Returns: A `Tensor`. Has the same type as `input`. """ - if data_format != "NCHW": - raise ValueError("Data formats other than NCHW are not yet supported") + if data_format != "NHWC": + raise ValueError("Data formats other than NHWC are not yet supported") return gen_nn_ops.dilation2d(input=input, filter=filters, From 949216e4c5c704c249bd09469b807211df84efd7 Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Fri, 28 Jun 2019 15:35:54 +0000 Subject: [PATCH 0048/3053] Adding ROCm support for the relu op --- tensorflow/core/kernels/relu_op.cc | 7 ++++++- tensorflow/core/kernels/relu_op_gpu.cu.cc | 25 +++++++++++++++-------- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/tensorflow/core/kernels/relu_op.cc b/tensorflow/core/kernels/relu_op.cc index e67695d54af..83ef50a2b97 100644 --- a/tensorflow/core/kernels/relu_op.cc +++ b/tensorflow/core/kernels/relu_op.cc @@ -74,7 +74,7 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_RELU_KERNELS); TF_CALL_GPU_NUMBER_TYPES(REGISTER_ELU_KERNELS); #undef REGISTER_ELU_KERNELS -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM // Forward declarations of the functor specializations for GPU. 
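// (The declarations below are compiled for both CUDA and ROCm builds; the
// qint8 Relu specialization further down is still guarded by GOOGLE_CUDA and
// therefore remains CUDA-only, as its TODO(rocm) comment notes.)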
namespace functor { #define DECLARE_GPU_SPEC(T) \ @@ -143,11 +143,14 @@ namespace functor { typename TTypes::Tensor backprops); \ extern template struct SeluGrad; +#if GOOGLE_CUDA +// TODO(rocm) : qint8 datatype currently not supported on the ROCm platform template <> void Relu::operator()( const GPUDevice& d, typename TTypes::ConstTensor features, typename TTypes::Tensor activations); extern template struct Relu; +#endif // GOOGLE_CUDA TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC); } // namespace functor @@ -188,6 +191,7 @@ TF_CALL_GPU_NUMBER_TYPES(DECLARE_GPU_SPEC); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); #undef REGISTER_GPU_KERNELS +#if GOOGLE_CUDA template class ReluOp : public UnaryElementWiseOp> { @@ -210,6 +214,7 @@ REGISTER_KERNEL_BUILDER( ReluOp); #endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM #ifdef TENSORFLOW_USE_SYCL // Registration of the GPU implementations. diff --git a/tensorflow/core/kernels/relu_op_gpu.cu.cc b/tensorflow/core/kernels/relu_op_gpu.cu.cc index 2ade89b7ff5..38784d5f60f 100644 --- a/tensorflow/core/kernels/relu_op_gpu.cu.cc +++ b/tensorflow/core/kernels/relu_op_gpu.cu.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define EIGEN_USE_GPU @@ -31,6 +31,11 @@ namespace tensorflow { typedef Eigen::GpuDevice GPUDevice; namespace functor { + +#if GOOGLE_CUDA +// TODO(rocm): disabling this code on the ROCm platform since the references +// to `half2` are leading to compile errors. + // This kernel computes ReluGrad by processing one half2, two fp16, at a time. // It effectively does: backdrops = (feature > 0) ? gradient : 0 // It also tries to use native half2 primitives as much as possible. 
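// A scalar-equivalent sketch of that computation (illustrative only):
//   for (int i = 0; i < count; ++i) {
//     backprop[i] = (feature[i] > Eigen::half(0)) ? gradient[i] : Eigen::half(0);
//   }
// The half2 path evaluates the same predicate on two fp16 lanes per instruction.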
@@ -104,17 +109,19 @@ struct ReluGrad { if (count == 0) return; int32 half2_count = Eigen::divup(count, 2); constexpr int32 kThreadInBlock = 512; - GpuLaunchConfig config = GetCudaLaunchConfigFixedBlockSize( + GpuLaunchConfig config = GetGpuLaunchConfigFixedBlockSize( half2_count, d, ReluGradHalfKernel, 0, kThreadInBlock); - TF_CHECK_OK(CudaLaunchKernel( + TF_CHECK_OK(GpuLaunchKernel( ReluGradHalfKernel, config.block_count, config.thread_per_block, 0, d.stream(), gradient.data(), feature.data(), backprop.data(), count)); } }; +#endif // GOOGLE_CUDA +#if GOOGLE_CUDA __global__ void Relu_int8x4_kernel(int vect_count, const int32* input, int32* output) { - CUDA_1D_KERNEL_LOOP(index, vect_count) { + GPU_1D_KERNEL_LOOP(index, vect_count) { output[index] = __vmaxs4(input[index], 0); } } @@ -133,14 +140,15 @@ struct Relu { int32 vect_count = Eigen::divup(count, 4); constexpr int32 kThreadInBlock = 512; - GpuLaunchConfig config = GetCudaLaunchConfigFixedBlockSize( + GpuLaunchConfig config = GetGpuLaunchConfigFixedBlockSize( vect_count, d, Relu_int8x4_kernel, 0, kThreadInBlock); - TF_CHECK_OK(CudaLaunchKernel( + TF_CHECK_OK(GpuLaunchKernel( Relu_int8x4_kernel, config.block_count, config.thread_per_block, 0, d.stream(), vect_count, reinterpret_cast(input.data()), reinterpret_cast(output.data()))); } }; +#endif // GOOGLE_CUDA } // namespace functor @@ -158,9 +166,10 @@ struct Relu { template struct functor::SeluGrad; TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_KERNELS); - +#if GOOGLE_CUDA template struct functor::Relu; +#endif // GOOGLE_CUDA } // end namespace tensorflow -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM From 69a449a49fc3c0b1ab08aa26b7990f1cf9c67dd5 Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Tue, 25 Jun 2019 15:26:27 +0000 Subject: [PATCH 0049/3053] Adding ROCm support for reduction ops --- tensorflow/core/kernels/reduction_ops.h | 4 ++++ tensorflow/core/kernels/reduction_ops_all.cc | 2 +- tensorflow/core/kernels/reduction_ops_any.cc | 2 +- tensorflow/core/kernels/reduction_ops_common_gpu.h | 4 ++-- tensorflow/core/kernels/reduction_ops_euclidean.cc | 4 +++- tensorflow/core/kernels/reduction_ops_gpu_bool.cu.cc | 4 ++-- tensorflow/core/kernels/reduction_ops_gpu_double.cu.cc | 4 ++-- tensorflow/core/kernels/reduction_ops_gpu_float.cu.cc | 4 ++-- tensorflow/core/kernels/reduction_ops_gpu_int.cu.cc | 4 ++-- tensorflow/core/kernels/reduction_ops_half_mean_sum.cu.cc | 4 ++-- tensorflow/core/kernels/reduction_ops_half_prod_max_min.cu.cc | 4 ++-- tensorflow/core/kernels/reduction_ops_max.cc | 2 +- tensorflow/core/kernels/reduction_ops_mean.cc | 4 +++- tensorflow/core/kernels/reduction_ops_min.cc | 2 +- tensorflow/core/kernels/reduction_ops_prod.cc | 4 +++- tensorflow/core/kernels/reduction_ops_sum.cc | 4 +++- 16 files changed, 34 insertions(+), 22 deletions(-) diff --git a/tensorflow/core/kernels/reduction_ops.h b/tensorflow/core/kernels/reduction_ops.h index 164359f601a..86cbc241d2a 100644 --- a/tensorflow/core/kernels/reduction_ops.h +++ b/tensorflow/core/kernels/reduction_ops.h @@ -117,6 +117,10 @@ struct Identity { FIX_MEAN_IDENTITY(Eigen::half) FIX_MEAN_IDENTITY(float) FIX_MEAN_IDENTITY(double) +#if GOOGLE_CUDA +FIX_MEAN_IDENTITY(complex64) +FIX_MEAN_IDENTITY(complex128) +#endif #undef FIX_MEAN_IDENTITY template diff --git a/tensorflow/core/kernels/reduction_ops_all.cc b/tensorflow/core/kernels/reduction_ops_all.cc index 4a34c4ef513..70ea87a2dfc 100644 --- a/tensorflow/core/kernels/reduction_ops_all.cc +++ b/tensorflow/core/kernels/reduction_ops_all.cc @@ -30,7 +30,7 @@ 
REGISTER_KERNEL_BUILDER( .HostMemory("reduction_indices"), ReductionOp); -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM REGISTER_KERNEL_BUILDER( Name("All") .TypeConstraint("Tidx") diff --git a/tensorflow/core/kernels/reduction_ops_any.cc b/tensorflow/core/kernels/reduction_ops_any.cc index 6c0519de95e..cd0ce289e51 100644 --- a/tensorflow/core/kernels/reduction_ops_any.cc +++ b/tensorflow/core/kernels/reduction_ops_any.cc @@ -30,7 +30,7 @@ REGISTER_KERNEL_BUILDER( .HostMemory("reduction_indices"), ReductionOp); -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM REGISTER_KERNEL_BUILDER( Name("Any") .TypeConstraint("Tidx") diff --git a/tensorflow/core/kernels/reduction_ops_common_gpu.h b/tensorflow/core/kernels/reduction_ops_common_gpu.h index 9af43f885f9..2415f1dbc6d 100644 --- a/tensorflow/core/kernels/reduction_ops_common_gpu.h +++ b/tensorflow/core/kernels/reduction_ops_common_gpu.h @@ -15,8 +15,8 @@ limitations under the License. #ifndef TENSORFLOW_CORE_KERNELS_REDUCTION_OPS_COMMON_GPU_H_ #define TENSORFLOW_CORE_KERNELS_REDUCTION_OPS_COMMON_GPU_H_ -#if !GOOGLE_CUDA -#error This file must only be included when building with Cuda support +#if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM +#error This file must only be included when building with GPU support #endif #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" diff --git a/tensorflow/core/kernels/reduction_ops_euclidean.cc b/tensorflow/core/kernels/reduction_ops_euclidean.cc index 9f4bf50e7ca..cf719e76cd8 100644 --- a/tensorflow/core/kernels/reduction_ops_euclidean.cc +++ b/tensorflow/core/kernels/reduction_ops_euclidean.cc @@ -33,7 +33,7 @@ namespace tensorflow { TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS); #undef REGISTER_CPU_KERNELS -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define REGISTER_GPU_KERNELS(type) \ REGISTER_KERNEL_BUILDER(Name("EuclideanNorm") \ @@ -51,8 +51,10 @@ TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS); ReductionOp>); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); +#if GOOGLE_CUDA TF_CALL_complex64(REGISTER_GPU_KERNELS); TF_CALL_complex128(REGISTER_GPU_KERNELS); +#endif #undef REGISTER_GPU_KERNELS #endif diff --git a/tensorflow/core/kernels/reduction_ops_gpu_bool.cu.cc b/tensorflow/core/kernels/reduction_ops_gpu_bool.cu.cc index 79ec1d59dfa..89bcf1d7ced 100644 --- a/tensorflow/core/kernels/reduction_ops_gpu_bool.cu.cc +++ b/tensorflow/core/kernels/reduction_ops_gpu_bool.cu.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define EIGEN_USE_GPU @@ -59,4 +59,4 @@ DEFINE_FOR_TYPE_AND_R(bool, Eigen::internal::OrReducer); } // end namespace functor } // end namespace tensorflow -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/reduction_ops_gpu_double.cu.cc b/tensorflow/core/kernels/reduction_ops_gpu_double.cu.cc index c492308a916..c952c4c9fa4 100644 --- a/tensorflow/core/kernels/reduction_ops_gpu_double.cu.cc +++ b/tensorflow/core/kernels/reduction_ops_gpu_double.cu.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define EIGEN_USE_GPU @@ -67,4 +67,4 @@ DEFINE_FOR_ALL_REDUCERS(double); } // end namespace functor } // end namespace tensorflow -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/reduction_ops_gpu_float.cu.cc b/tensorflow/core/kernels/reduction_ops_gpu_float.cu.cc index b006311c125..92f4b9d707c 100644 --- a/tensorflow/core/kernels/reduction_ops_gpu_float.cu.cc +++ b/tensorflow/core/kernels/reduction_ops_gpu_float.cu.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define EIGEN_USE_GPU @@ -67,4 +67,4 @@ DEFINE_FOR_ALL_REDUCERS(float); } // end namespace functor } // end namespace tensorflow -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/reduction_ops_gpu_int.cu.cc b/tensorflow/core/kernels/reduction_ops_gpu_int.cu.cc index 91a33b92cb6..c35d8c2ec86 100644 --- a/tensorflow/core/kernels/reduction_ops_gpu_int.cu.cc +++ b/tensorflow/core/kernels/reduction_ops_gpu_int.cu.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define EIGEN_USE_GPU @@ -68,4 +68,4 @@ DEFINE_FOR_ALL_REDUCERS(int64); } // end namespace functor } // end namespace tensorflow -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/reduction_ops_half_mean_sum.cu.cc b/tensorflow/core/kernels/reduction_ops_half_mean_sum.cu.cc index f33d504e25a..bbb34c9d3ba 100644 --- a/tensorflow/core/kernels/reduction_ops_half_mean_sum.cu.cc +++ b/tensorflow/core/kernels/reduction_ops_half_mean_sum.cu.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define EIGEN_USE_GPU @@ -64,4 +64,4 @@ DEFINE_FOR_ALL_REDUCERS(Eigen::half); } // end namespace functor } // end namespace tensorflow -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/reduction_ops_half_prod_max_min.cu.cc b/tensorflow/core/kernels/reduction_ops_half_prod_max_min.cu.cc index 84fd389bb38..d2a180ba351 100644 --- a/tensorflow/core/kernels/reduction_ops_half_prod_max_min.cu.cc +++ b/tensorflow/core/kernels/reduction_ops_half_prod_max_min.cu.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define EIGEN_USE_GPU @@ -64,4 +64,4 @@ DEFINE_FOR_ALL_REDUCERS(Eigen::half); } // end namespace functor } // end namespace tensorflow -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/reduction_ops_max.cc b/tensorflow/core/kernels/reduction_ops_max.cc index 8bfa44b2d06..fe9775f7f1d 100644 --- a/tensorflow/core/kernels/reduction_ops_max.cc +++ b/tensorflow/core/kernels/reduction_ops_max.cc @@ -33,7 +33,7 @@ namespace tensorflow { TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS); #undef REGISTER_CPU_KERNELS -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define REGISTER_GPU_KERNELS(type) \ REGISTER_KERNEL_BUILDER( \ diff --git a/tensorflow/core/kernels/reduction_ops_mean.cc b/tensorflow/core/kernels/reduction_ops_mean.cc index 67c974edda2..d314f1953dc 100644 --- a/tensorflow/core/kernels/reduction_ops_mean.cc +++ b/tensorflow/core/kernels/reduction_ops_mean.cc @@ -33,7 +33,7 @@ namespace tensorflow { TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS); #undef REGISTER_CPU_KERNELS -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define REGISTER_GPU_KERNELS(type) \ REGISTER_KERNEL_BUILDER( \ @@ -51,8 +51,10 @@ TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS); .HostMemory("reduction_indices"), \ ReductionOp>); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); +#if GOOGLE_CUDA TF_CALL_complex64(REGISTER_GPU_KERNELS); TF_CALL_complex128(REGISTER_GPU_KERNELS); +#endif #undef REGISTER_GPU_KERNELS #endif diff --git a/tensorflow/core/kernels/reduction_ops_min.cc b/tensorflow/core/kernels/reduction_ops_min.cc index 5c537c5b9c7..9f1feae969e 100644 --- a/tensorflow/core/kernels/reduction_ops_min.cc +++ b/tensorflow/core/kernels/reduction_ops_min.cc @@ -33,7 +33,7 @@ namespace tensorflow { TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS); #undef REGISTER_CPU_KERNELS -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define REGISTER_GPU_KERNELS(type) \ REGISTER_KERNEL_BUILDER( \ diff --git a/tensorflow/core/kernels/reduction_ops_prod.cc b/tensorflow/core/kernels/reduction_ops_prod.cc index e9b23df7460..0642bad9218 100644 --- a/tensorflow/core/kernels/reduction_ops_prod.cc +++ b/tensorflow/core/kernels/reduction_ops_prod.cc @@ -33,7 +33,7 @@ namespace tensorflow { TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS); #undef REGISTER_CPU_KERNELS -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define REGISTER_GPU_KERNELS(type) \ REGISTER_KERNEL_BUILDER(Name("Prod") \ @@ -52,8 +52,10 @@ TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS); Eigen::internal::ProdReducer>); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); TF_CALL_int32(REGISTER_GPU_KERNELS); +#if GOOGLE_CUDA TF_CALL_complex64(REGISTER_GPU_KERNELS); TF_CALL_complex128(REGISTER_GPU_KERNELS); +#endif #undef REGISTER_GPU_KERNELS #endif diff --git a/tensorflow/core/kernels/reduction_ops_sum.cc b/tensorflow/core/kernels/reduction_ops_sum.cc index cf0d0f5c714..d79684df290 100644 --- a/tensorflow/core/kernels/reduction_ops_sum.cc +++ b/tensorflow/core/kernels/reduction_ops_sum.cc @@ -33,7 +33,7 @@ namespace tensorflow { TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS); #undef REGISTER_CPU_KERNELS -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define REGISTER_GPU_KERNELS(type) \ REGISTER_KERNEL_BUILDER( \ @@ -52,8 +52,10 @@ TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS); ReductionOp>); TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); 
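// (The complex64 and complex128 registrations just below are compiled only
// when GOOGLE_CUDA is defined, so ROCm builds skip them, matching the guards
// added in the other reduction kernels.)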
TF_CALL_int64(REGISTER_GPU_KERNELS); +#if GOOGLE_CUDA TF_CALL_complex64(REGISTER_GPU_KERNELS); TF_CALL_complex128(REGISTER_GPU_KERNELS); +#endif #undef REGISTER_GPU_KERNELS // A special GPU kernel for int32. From 20055ea79163639bbc304c211459d91a0ab3c8f1 Mon Sep 17 00:00:00 2001 From: Pete Blacker Date: Wed, 3 Jul 2019 15:06:54 +0100 Subject: [PATCH 0050/3053] Added Leon compiler and emulator to third party downloads. --- .../experimental/micro/tools/make/targets/leon_makefile.inc | 5 +++++ .../experimental/micro/tools/make/third_party_downloads.inc | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/leon_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/leon_makefile.inc index fc8673d1268..06dd99edcfc 100644 --- a/tensorflow/lite/experimental/micro/tools/make/targets/leon_makefile.inc +++ b/tensorflow/lite/experimental/micro/tools/make/targets/leon_makefile.inc @@ -6,4 +6,9 @@ ifeq ($(TARGET), leon) TARGET_ARCH := leon TARGET_TOOLCHAIN_PREFIX := sparc-gaisler-elf- TEST_SCRIPT := tensorflow/lite/experimental/micro/testing/test_leon_binary.sh + GCC_LEON := $(MAKEFILE_DIR)/downloads/leon_bcc2/ + + $(eval $(call add_third_party_download,$(LEON_BCC2_URL),$(LEON_BCC2_MD5),leon_bcc2,)) + $(eval $(call add_third_party_download,$(TSIM_URL),$(TSIM_MD5),tsim,)) + endif diff --git a/tensorflow/lite/experimental/micro/tools/make/third_party_downloads.inc b/tensorflow/lite/experimental/micro/tools/make/third_party_downloads.inc index f27cb682273..40d5359392f 100644 --- a/tensorflow/lite/experimental/micro/tools/make/third_party_downloads.inc +++ b/tensorflow/lite/experimental/micro/tools/make/third_party_downloads.inc @@ -14,6 +14,12 @@ else GCC_EMBEDDED_MD5 := "299ebd3f1c2c90930d28ab82e5d8d6c0" endif +LEON_BCC2_URL := "https://www.gaisler.com/anonftp/bcc2/bin/bcc-2.0.7-gcc-linux64.tar.xz" +LEON_BCC2_MD5 := "cdf78082be4882da2a92c9baa82fe765" + +TSIM_URL := "https://www.gaisler.com/anonftp/tsim/tsim-eval-2.0.63.tar.gz" +TSIM_MD5 := "afa0095d3ed989a949e1467f94e41d2f" + CMSIS_URL := "https://github.com/ARM-software/CMSIS_5/archive/5.4.0.zip" CMSIS_MD5 := "f451f1dccc844e894939055db278a40e" From 1a2236e7ca31c93b33a9e7c5dbd6af33ac82713c Mon Sep 17 00:00:00 2001 From: Pete Blacker Date: Wed, 3 Jul 2019 15:09:36 +0100 Subject: [PATCH 0051/3053] Added support for big endian systems to TFL micro: weights are converted to big endian on startup, several accesses are made endian safe and non-aligned safe. --- .../experimental/micro/micro_interpreter.cc | 54 +++++++++++++++++-- .../experimental/micro/micro_interpreter.h | 5 ++ .../micro/simple_tensor_allocator.cc | 40 +++++++++++++- tensorflow/lite/kernels/kernel_util.h | 11 ++-- 4 files changed, 100 insertions(+), 10 deletions(-) diff --git a/tensorflow/lite/experimental/micro/micro_interpreter.cc b/tensorflow/lite/experimental/micro/micro_interpreter.cc index 49ec03e85f4..000ee2b254c 100644 --- a/tensorflow/lite/experimental/micro/micro_interpreter.cc +++ b/tensorflow/lite/experimental/micro/micro_interpreter.cc @@ -18,6 +18,8 @@ limitations under the License. 
#include "tensorflow/lite/core/api/flatbuffer_conversions.h" #include "tensorflow/lite/experimental/micro/compatibility.h" +#include // ############### temp debugging + namespace tflite { namespace { const int kStackDataAllocatorSize = 128; @@ -78,7 +80,6 @@ MicroInterpreter::MicroInterpreter(const Model* model, subgraph_ = (*subgraphs)[0]; tensors_ = subgraph_->tensors(); operators_ = subgraph_->operators(); - context_.tensors_size = tensors_->size(); context_.tensors = reinterpret_cast(tensor_allocator_->AllocateMemory( @@ -86,16 +87,45 @@ MicroInterpreter::MicroInterpreter(const Model* model, context_.impl_ = static_cast(this); context_.ReportError = ReportOpError; context_.recommended_num_threads = 1; - initialization_status_ = AllocateInputAndActTensors(); if (initialization_status_ != kTfLiteOk) { return; } - initialization_status_ = AllocateTemporaryTensors(); if (initialization_status_ != kTfLiteOk) { return; } + // If the system is big endian then convert weights from the flatbuffer from little to big endian + // on startup so that it does not need to be done during inference. + if (!FLATBUFFERS_LITTLEENDIAN) { + for (int t=0; tallocation_type == kTfLiteMmapRo) + CorrectTensorEndianness(thisTensor); + } + } +} + +void MicroInterpreter::CorrectTensorEndianness(TfLiteTensor *tensorCorr) { + int32_t tensorSize = 1; + for (int d=0; ddims->size; ++d) + tensorSize *= ((const int32_t*)tensorCorr->dims->data)[d]; + + switch(tensorCorr->type) { + case TfLiteType::kTfLiteFloat32: CorrectTensorDataEndianness(tensorCorr->data.f, tensorSize); break; + case TfLiteType::kTfLiteFloat16: CorrectTensorDataEndianness(tensorCorr->data.f16, tensorSize); break; + case TfLiteType::kTfLiteInt64: CorrectTensorDataEndianness(tensorCorr->data.i64, tensorSize); break; + case TfLiteType::kTfLiteInt32: CorrectTensorDataEndianness(tensorCorr->data.i32, tensorSize); break; + case TfLiteType::kTfLiteInt16: CorrectTensorDataEndianness(tensorCorr->data.i16, tensorSize); break; + case TfLiteType::kTfLiteComplex64: CorrectTensorDataEndianness(tensorCorr->data.c64, tensorSize); break; + } +} + +template +void MicroInterpreter::CorrectTensorDataEndianness(T *data, int32_t size) { + for (int32_t i=0; i(temporaries_data); temporaries_array->size = 0; + const int kWeights = 1; + + //printf("Index of Weights input of this operation is [%d]", flatbuffers::EndianScalar(inputs_array->data[kWeights])); + + //TfLiteTensor *t_test = &context_.tensors[flatbuffers::EndianScalar(inputs_array->data[kWeights])]; + + //printf("Testing a weights tensor instance. is variable? 
%d\n", (int)(t_test->is_variable)); + TfLiteNode node; node.inputs = inputs_array; node.outputs = outputs_array; @@ -301,4 +339,14 @@ TfLiteTensor* MicroInterpreter::output(int index) { return &(context_.tensors[outputs->Get(index)]); } +TfLiteTensor* MicroInterpreter::tensor(int index) { + const size_t length = tensors_size(); + if ((index < 0) || (index >= tensors_size())) { + error_reporter_->Report("Tensor index %d out of range (length is %d)", + index, length); + return nullptr; + } + return &context_.tensors[index]; +} + } // namespace tflite diff --git a/tensorflow/lite/experimental/micro/micro_interpreter.h b/tensorflow/lite/experimental/micro/micro_interpreter.h index 04d9c7cba8d..34e1228c87c 100644 --- a/tensorflow/lite/experimental/micro/micro_interpreter.h +++ b/tensorflow/lite/experimental/micro/micro_interpreter.h @@ -56,6 +56,11 @@ class MicroInterpreter { TfLiteStatus AllocateInputAndActTensors(); TfLiteStatus AllocateTemporaryTensors(); + void CorrectTensorEndianness(TfLiteTensor *tensorCorr); + + template + void CorrectTensorDataEndianness(T *data, int32_t size); + const Model* model_; const OpResolver& op_resolver_; SimpleTensorAllocator* tensor_allocator_; diff --git a/tensorflow/lite/experimental/micro/simple_tensor_allocator.cc b/tensorflow/lite/experimental/micro/simple_tensor_allocator.cc index 8da498e904b..efd6574b23e 100644 --- a/tensorflow/lite/experimental/micro/simple_tensor_allocator.cc +++ b/tensorflow/lite/experimental/micro/simple_tensor_allocator.cc @@ -79,10 +79,12 @@ TfLiteStatus SimpleTensorAllocator::AllocateTensor( int destroy_after, const flatbuffers::Vector>* buffers, ErrorReporter* error_reporter, TfLiteTensor* result) { + //printf("Alloc 1\n"); fflush(stdout); TF_LITE_ENSURE_STATUS(ConvertTensorType(flatbuffer_tensor.type(), &result->type, error_reporter)); result->is_variable = flatbuffer_tensor.is_variable(); + //printf("Alloc 2\n"); fflush(stdout); result->data.raw = nullptr; result->bytes = 0; if (auto* buffer = (*buffers)[flatbuffer_tensor.buffer()]) { @@ -97,8 +99,11 @@ TfLiteStatus SimpleTensorAllocator::AllocateTensor( } } } + + //printf("Alloc 3\n"); fflush(stdout); if (result->data.raw) { result->allocation_type = kTfLiteMmapRo; + //printf("Alloc mapped to RO memory area.\n"); fflush(stdout); } else { int data_size = 1; for (int n = 0; n < flatbuffer_tensor.shape()->Length(); ++n) { @@ -108,6 +113,7 @@ TfLiteStatus SimpleTensorAllocator::AllocateTensor( TF_LITE_ENSURE_STATUS(BytesRequired(flatbuffer_tensor, data_size, &result->bytes, &type_size, error_reporter)); + //printf("Allocating [%d] bytes for tensor.", (data_size * type_size)); fflush(stdout); result->data.raw = reinterpret_cast(AllocateMemory(result->bytes, type_size)); if (result->data.raw == nullptr) { @@ -115,6 +121,7 @@ TfLiteStatus SimpleTensorAllocator::AllocateTensor( if (tensor_name == nullptr) { tensor_name = ""; } + //printf("tensor name without implicit bool conversion is \"%s\".zn", tensor_name); fflush(stdout); error_reporter->Report( "Couldn't allocate memory for tensor '%s', wanted %d bytes but only " "%d were available", @@ -123,26 +130,55 @@ TfLiteStatus SimpleTensorAllocator::AllocateTensor( } result->allocation_type = kTfLiteArenaRw; } + + //printf("Alloc 4\n"); fflush(stdout); result->dims = reinterpret_cast(AllocateMemory( sizeof(int) * (flatbuffer_tensor.shape()->Length() + 1), sizeof(int))); result->dims->size = flatbuffer_tensor.shape()->Length(); + + //printf("Alloc 5\n"); fflush(stdout); for (int n = 0; n < flatbuffer_tensor.shape()->Length(); ++n) { 
result->dims->data[n] = flatbuffer_tensor.shape()->Get(n); } + + //printf("Alloc 6\n"); fflush(stdout); const auto* src_quantization = flatbuffer_tensor.quantization(); + //printf("Alloc 7, bump\n"); fflush(stdout); if (src_quantization && src_quantization->scale() && (src_quantization->scale()->size() > 0) && src_quantization->zero_point() && (src_quantization->zero_point()->size() > 0)) { + //printf("Made it into if body.\n"); fflush(stdout); result->params.scale = src_quantization->scale()->Get(0); - result->params.zero_point = src_quantization->zero_point()->Get(0); + //printf("Scale is %f", result->params.scale); fflush(stdout); + + //result->params.zero_point = src_quantization->zero_point()->Get(0); + + //const uint8_t * Data () + + memcpy(&result->params.zero_point, (int64_t*)src_quantization->zero_point()->Data(), sizeof(int64_t)); + + //printf("int64_t sanity check size is %d", sizeof(int64_t)); + + //printf("Zero point bytes [ "); + //for (int b=0; b<8; ++b) + // printf("0x%02X ", *(((unsigned char*)&result->params.zero_point)+b) ); + //printf("]\n"); + + result->params.zero_point = flatbuffers::EndianScalar(result->params.zero_point); + + //printf("zero point is %ld", result->params.zero_point); fflush(stdout); } + //printf("Alloc 8\n"); fflush(stdout); result->allocation = nullptr; - if (flatbuffer_tensor.name()) { + const char *test = flatbuffer_tensor.name()->c_str(); + //printf("name->c_str() is [%d]", (long int)test); fflush(stdout); + if (flatbuffer_tensor.name()->c_str() != nullptr) { // <----- leon fix ??? maybe not :-( result->name = flatbuffer_tensor.name()->c_str(); } else { result->name = ""; } + //printf("Alloc 9, name=\"%s\"\n", result->name); fflush(stdout); result->delegate = nullptr; result->buffer_handle = 0; result->data_is_stale = false; diff --git a/tensorflow/lite/kernels/kernel_util.h b/tensorflow/lite/kernels/kernel_util.h index ab065513e59..38214e0dff4 100644 --- a/tensorflow/lite/kernels/kernel_util.h +++ b/tensorflow/lite/kernels/kernel_util.h @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/c_api_internal.h" +#include "flatbuffers/flatbuffers.h" namespace tflite { @@ -29,20 +30,20 @@ inline int SizeOfDimension(const TfLiteTensor* t, int dim) { } inline const TfLiteTensor* GetInput(TfLiteContext* context, TfLiteNode* node, int index) { - return &context->tensors[node->inputs->data[index]]; + return &context->tensors[flatbuffers::EndianScalar(node->inputs->data[index])]; } inline TfLiteTensor* GetVariableInput(TfLiteContext* context, TfLiteNode* node, int index) { - TfLiteTensor* tensor = &context->tensors[node->inputs->data[index]]; + TfLiteTensor* tensor = &context->tensors[flatbuffers::EndianScalar(node->inputs->data[index])]; return (tensor->is_variable) ? 
tensor : nullptr; } inline TfLiteTensor* GetOutput(TfLiteContext* context, TfLiteNode* node, int index) { - return &context->tensors[node->outputs->data[index]]; + return &context->tensors[flatbuffers::EndianScalar(node->outputs->data[index])]; } inline TfLiteTensor* GetTemporary(TfLiteContext* context, TfLiteNode* node, int index) { - return &context->tensors[node->temporaries->data[index]]; + return &context->tensors[flatbuffers::EndianScalar(node->temporaries->data[index])]; } inline int NumInputs(const TfLiteNode* node) { return node->inputs->size; } inline int NumOutputs(const TfLiteNode* node) { return node->outputs->size; } @@ -60,7 +61,7 @@ inline const TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context, int index) { const bool use_tensor = node->inputs->data[index] != kOptionalTensor; if (use_tensor) { - return &context->tensors[node->inputs->data[index]]; + return &context->tensors[flatbuffers::EndianScalar(node->inputs->data[index])]; // <------------ } return nullptr; } From da1ff92d2e7aa4c787b40554b0dc149fb3ba6b0c Mon Sep 17 00:00:00 2001 From: Pete Blacker Date: Wed, 3 Jul 2019 16:16:57 +0100 Subject: [PATCH 0052/3053] Cleaned up code now passing clang-tidy checks --- .../experimental/micro/micro_interpreter.cc | 51 ++++++++++--------- .../experimental/micro/micro_interpreter.h | 4 +- .../micro/simple_tensor_allocator.cc | 44 ++-------------- tensorflow/lite/kernels/kernel_util.h | 16 ++++-- 4 files changed, 46 insertions(+), 69 deletions(-) diff --git a/tensorflow/lite/experimental/micro/micro_interpreter.cc b/tensorflow/lite/experimental/micro/micro_interpreter.cc index 000ee2b254c..95a2ca49d88 100644 --- a/tensorflow/lite/experimental/micro/micro_interpreter.cc +++ b/tensorflow/lite/experimental/micro/micro_interpreter.cc @@ -18,8 +18,6 @@ limitations under the License. #include "tensorflow/lite/core/api/flatbuffer_conversions.h" #include "tensorflow/lite/experimental/micro/compatibility.h" -#include // ############### temp debugging - namespace tflite { namespace { const int kStackDataAllocatorSize = 128; @@ -95,35 +93,48 @@ MicroInterpreter::MicroInterpreter(const Model* model, if (initialization_status_ != kTfLiteOk) { return; } - // If the system is big endian then convert weights from the flatbuffer from little to big endian - // on startup so that it does not need to be done during inference. + // If the system is big endian then convert weights from the flatbuffer from + // little to big endian on startup so that it does not need to be done during + // inference. 
if (!FLATBUFFERS_LITTLEENDIAN) { - for (int t=0; tallocation_type == kTfLiteMmapRo) CorrectTensorEndianness(thisTensor); } } } -void MicroInterpreter::CorrectTensorEndianness(TfLiteTensor *tensorCorr) { +void MicroInterpreter::CorrectTensorEndianness(TfLiteTensor* tensorCorr) { int32_t tensorSize = 1; - for (int d=0; ddims->size; ++d) + for (int d = 0; d < tensorCorr->dims->size; ++d) tensorSize *= ((const int32_t*)tensorCorr->dims->data)[d]; - switch(tensorCorr->type) { - case TfLiteType::kTfLiteFloat32: CorrectTensorDataEndianness(tensorCorr->data.f, tensorSize); break; - case TfLiteType::kTfLiteFloat16: CorrectTensorDataEndianness(tensorCorr->data.f16, tensorSize); break; - case TfLiteType::kTfLiteInt64: CorrectTensorDataEndianness(tensorCorr->data.i64, tensorSize); break; - case TfLiteType::kTfLiteInt32: CorrectTensorDataEndianness(tensorCorr->data.i32, tensorSize); break; - case TfLiteType::kTfLiteInt16: CorrectTensorDataEndianness(tensorCorr->data.i16, tensorSize); break; - case TfLiteType::kTfLiteComplex64: CorrectTensorDataEndianness(tensorCorr->data.c64, tensorSize); break; + switch (tensorCorr->type) { + case TfLiteType::kTfLiteFloat32: + CorrectTensorDataEndianness(tensorCorr->data.f, tensorSize); + break; + case TfLiteType::kTfLiteFloat16: + CorrectTensorDataEndianness(tensorCorr->data.f16, tensorSize); + break; + case TfLiteType::kTfLiteInt64: + CorrectTensorDataEndianness(tensorCorr->data.i64, tensorSize); + break; + case TfLiteType::kTfLiteInt32: + CorrectTensorDataEndianness(tensorCorr->data.i32, tensorSize); + break; + case TfLiteType::kTfLiteInt16: + CorrectTensorDataEndianness(tensorCorr->data.i16, tensorSize); + break; + case TfLiteType::kTfLiteComplex64: + CorrectTensorDataEndianness(tensorCorr->data.c64, tensorSize); + break; } } template -void MicroInterpreter::CorrectTensorDataEndianness(T *data, int32_t size) { - for (int32_t i=0; idata[kWeights])); - - //TfLiteTensor *t_test = &context_.tensors[flatbuffers::EndianScalar(inputs_array->data[kWeights])]; - - //printf("Testing a weights tensor instance. is variable? 
%d\n", (int)(t_test->is_variable)); - TfLiteNode node; node.inputs = inputs_array; node.outputs = outputs_array; diff --git a/tensorflow/lite/experimental/micro/micro_interpreter.h b/tensorflow/lite/experimental/micro/micro_interpreter.h index 34e1228c87c..3f9fd6ec482 100644 --- a/tensorflow/lite/experimental/micro/micro_interpreter.h +++ b/tensorflow/lite/experimental/micro/micro_interpreter.h @@ -56,10 +56,10 @@ class MicroInterpreter { TfLiteStatus AllocateInputAndActTensors(); TfLiteStatus AllocateTemporaryTensors(); - void CorrectTensorEndianness(TfLiteTensor *tensorCorr); + void CorrectTensorEndianness(TfLiteTensor* tensorCorr); template - void CorrectTensorDataEndianness(T *data, int32_t size); + void CorrectTensorDataEndianness(T* data, int32_t size); const Model* model_; const OpResolver& op_resolver_; diff --git a/tensorflow/lite/experimental/micro/simple_tensor_allocator.cc b/tensorflow/lite/experimental/micro/simple_tensor_allocator.cc index efd6574b23e..ad2327cf39a 100644 --- a/tensorflow/lite/experimental/micro/simple_tensor_allocator.cc +++ b/tensorflow/lite/experimental/micro/simple_tensor_allocator.cc @@ -79,12 +79,9 @@ TfLiteStatus SimpleTensorAllocator::AllocateTensor( int destroy_after, const flatbuffers::Vector>* buffers, ErrorReporter* error_reporter, TfLiteTensor* result) { - //printf("Alloc 1\n"); fflush(stdout); TF_LITE_ENSURE_STATUS(ConvertTensorType(flatbuffer_tensor.type(), &result->type, error_reporter)); result->is_variable = flatbuffer_tensor.is_variable(); - - //printf("Alloc 2\n"); fflush(stdout); result->data.raw = nullptr; result->bytes = 0; if (auto* buffer = (*buffers)[flatbuffer_tensor.buffer()]) { @@ -99,11 +96,8 @@ TfLiteStatus SimpleTensorAllocator::AllocateTensor( } } } - - //printf("Alloc 3\n"); fflush(stdout); if (result->data.raw) { result->allocation_type = kTfLiteMmapRo; - //printf("Alloc mapped to RO memory area.\n"); fflush(stdout); } else { int data_size = 1; for (int n = 0; n < flatbuffer_tensor.shape()->Length(); ++n) { @@ -113,7 +107,6 @@ TfLiteStatus SimpleTensorAllocator::AllocateTensor( TF_LITE_ENSURE_STATUS(BytesRequired(flatbuffer_tensor, data_size, &result->bytes, &type_size, error_reporter)); - //printf("Allocating [%d] bytes for tensor.", (data_size * type_size)); fflush(stdout); result->data.raw = reinterpret_cast(AllocateMemory(result->bytes, type_size)); if (result->data.raw == nullptr) { @@ -121,7 +114,6 @@ TfLiteStatus SimpleTensorAllocator::AllocateTensor( if (tensor_name == nullptr) { tensor_name = ""; } - //printf("tensor name without implicit bool conversion is \"%s\".zn", tensor_name); fflush(stdout); error_reporter->Report( "Couldn't allocate memory for tensor '%s', wanted %d bytes but only " "%d were available", @@ -130,55 +122,29 @@ TfLiteStatus SimpleTensorAllocator::AllocateTensor( } result->allocation_type = kTfLiteArenaRw; } - - //printf("Alloc 4\n"); fflush(stdout); result->dims = reinterpret_cast(AllocateMemory( sizeof(int) * (flatbuffer_tensor.shape()->Length() + 1), sizeof(int))); result->dims->size = flatbuffer_tensor.shape()->Length(); - - //printf("Alloc 5\n"); fflush(stdout); for (int n = 0; n < flatbuffer_tensor.shape()->Length(); ++n) { result->dims->data[n] = flatbuffer_tensor.shape()->Get(n); } - - //printf("Alloc 6\n"); fflush(stdout); const auto* src_quantization = flatbuffer_tensor.quantization(); - //printf("Alloc 7, bump\n"); fflush(stdout); if (src_quantization && src_quantization->scale() && (src_quantization->scale()->size() > 0) && src_quantization->zero_point() && 
(src_quantization->zero_point()->size() > 0)) { - //printf("Made it into if body.\n"); fflush(stdout); result->params.scale = src_quantization->scale()->Get(0); - //printf("Scale is %f", result->params.scale); fflush(stdout); - - //result->params.zero_point = src_quantization->zero_point()->Get(0); - - //const uint8_t * Data () - - memcpy(&result->params.zero_point, (int64_t*)src_quantization->zero_point()->Data(), sizeof(int64_t)); - - //printf("int64_t sanity check size is %d", sizeof(int64_t)); - - //printf("Zero point bytes [ "); - //for (int b=0; b<8; ++b) - // printf("0x%02X ", *(((unsigned char*)&result->params.zero_point)+b) ); - //printf("]\n"); - - result->params.zero_point = flatbuffers::EndianScalar(result->params.zero_point); - - //printf("zero point is %ld", result->params.zero_point); fflush(stdout); + memcpy(&result->params.zero_point, + (int64_t*)src_quantization->zero_point()->Data(), sizeof(int64_t)); + result->params.zero_point = + flatbuffers::EndianScalar(result->params.zero_point); } - //printf("Alloc 8\n"); fflush(stdout); result->allocation = nullptr; - const char *test = flatbuffer_tensor.name()->c_str(); - //printf("name->c_str() is [%d]", (long int)test); fflush(stdout); - if (flatbuffer_tensor.name()->c_str() != nullptr) { // <----- leon fix ??? maybe not :-( + if (flatbuffer_tensor.name()->c_str() != nullptr) { result->name = flatbuffer_tensor.name()->c_str(); } else { result->name = ""; } - //printf("Alloc 9, name=\"%s\"\n", result->name); fflush(stdout); result->delegate = nullptr; result->buffer_handle = 0; result->data_is_stale = false; diff --git a/tensorflow/lite/kernels/kernel_util.h b/tensorflow/lite/kernels/kernel_util.h index 38214e0dff4..d21f8ea452a 100644 --- a/tensorflow/lite/kernels/kernel_util.h +++ b/tensorflow/lite/kernels/kernel_util.h @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/c_api_internal.h" + #include "flatbuffers/flatbuffers.h" namespace tflite { @@ -30,20 +31,24 @@ inline int SizeOfDimension(const TfLiteTensor* t, int dim) { } inline const TfLiteTensor* GetInput(TfLiteContext* context, TfLiteNode* node, int index) { - return &context->tensors[flatbuffers::EndianScalar(node->inputs->data[index])]; + return &context + ->tensors[flatbuffers::EndianScalar(node->inputs->data[index])]; } inline TfLiteTensor* GetVariableInput(TfLiteContext* context, TfLiteNode* node, int index) { - TfLiteTensor* tensor = &context->tensors[flatbuffers::EndianScalar(node->inputs->data[index])]; + TfLiteTensor* tensor = + &context->tensors[flatbuffers::EndianScalar(node->inputs->data[index])]; return (tensor->is_variable) ? 
tensor : nullptr; } inline TfLiteTensor* GetOutput(TfLiteContext* context, TfLiteNode* node, int index) { - return &context->tensors[flatbuffers::EndianScalar(node->outputs->data[index])]; + return &context + ->tensors[flatbuffers::EndianScalar(node->outputs->data[index])]; } inline TfLiteTensor* GetTemporary(TfLiteContext* context, TfLiteNode* node, int index) { - return &context->tensors[flatbuffers::EndianScalar(node->temporaries->data[index])]; + return &context->tensors[flatbuffers::EndianScalar( + node->temporaries->data[index])]; } inline int NumInputs(const TfLiteNode* node) { return node->inputs->size; } inline int NumOutputs(const TfLiteNode* node) { return node->outputs->size; } @@ -61,7 +66,8 @@ inline const TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context, int index) { const bool use_tensor = node->inputs->data[index] != kOptionalTensor; if (use_tensor) { - return &context->tensors[flatbuffers::EndianScalar(node->inputs->data[index])]; // <------------ + return &context + ->tensors[flatbuffers::EndianScalar(node->inputs->data[index])]; } return nullptr; } From cafbcc3e51aecc49c0874cc3c490fccbb25e3538 Mon Sep 17 00:00:00 2001 From: Pete Blacker Date: Wed, 3 Jul 2019 16:26:09 +0100 Subject: [PATCH 0053/3053] Cleaned up a few comments and removed redundant code --- tensorflow/lite/experimental/micro/micro_interpreter.cc | 2 -- tensorflow/lite/experimental/micro/testing/test_leon_binary.sh | 2 -- .../experimental/micro/tools/make/targets/leon_makefile.inc | 2 +- 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/tensorflow/lite/experimental/micro/micro_interpreter.cc b/tensorflow/lite/experimental/micro/micro_interpreter.cc index 95a2ca49d88..1a1c132e16c 100644 --- a/tensorflow/lite/experimental/micro/micro_interpreter.cc +++ b/tensorflow/lite/experimental/micro/micro_interpreter.cc @@ -284,8 +284,6 @@ TfLiteStatus MicroInterpreter::Invoke() { reinterpret_cast(temporaries_data); temporaries_array->size = 0; - const int kWeights = 1; - TfLiteNode node; node.inputs = inputs_array; node.outputs = outputs_array; diff --git a/tensorflow/lite/experimental/micro/testing/test_leon_binary.sh b/tensorflow/lite/experimental/micro/testing/test_leon_binary.sh index d40bf149ccb..6a84322e1d4 100755 --- a/tensorflow/lite/experimental/micro/testing/test_leon_binary.sh +++ b/tensorflow/lite/experimental/micro/testing/test_leon_binary.sh @@ -33,8 +33,6 @@ SCRIPT_PATH="`dirname \"$BASH_SOURCE\"`" SCRIPT_PATH="`( cd \"$SCRIPT_PATH\" && pwd )`" LEON_COMMANDS="$SCRIPT_PATH/leon_commands" -echo "pwd is ${ROOT_DIR}" - tsim-leon3 $1 -c ${LEON_COMMANDS} 2>&1 | tee ${MICRO_LOG_FILENAME} if grep -q "$2" ${MICRO_LOG_FILENAME} diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/leon_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/leon_makefile.inc index 06dd99edcfc..1504a09d1b8 100644 --- a/tensorflow/lite/experimental/micro/tools/make/targets/leon_makefile.inc +++ b/tensorflow/lite/experimental/micro/tools/make/targets/leon_makefile.inc @@ -1,4 +1,4 @@ -# Settings for x86 on Linux +# Settings for SparcV8 based LEON processors from Gaisler Aeroflex ifeq ($(TARGET), leon) PLATFORM_FLAGS = -O3 -mcpu=leon3 CXXFLAGS += -std=c++11 $(PLATFORM_FLAGS) From 742a9ac2869af42becc845490f035cb82a2aa22e Mon Sep 17 00:00:00 2001 From: Pete Blacker Date: Wed, 3 Jul 2019 16:28:38 +0100 Subject: [PATCH 0054/3053] Sorted out whitespace --- tensorflow/lite/experimental/micro/micro_interpreter.cc | 4 ++++ tensorflow/lite/experimental/micro/simple_tensor_allocator.cc | 1 + 2 
files changed, 5 insertions(+) diff --git a/tensorflow/lite/experimental/micro/micro_interpreter.cc b/tensorflow/lite/experimental/micro/micro_interpreter.cc index 1a1c132e16c..3dc83edf458 100644 --- a/tensorflow/lite/experimental/micro/micro_interpreter.cc +++ b/tensorflow/lite/experimental/micro/micro_interpreter.cc @@ -78,6 +78,7 @@ MicroInterpreter::MicroInterpreter(const Model* model, subgraph_ = (*subgraphs)[0]; tensors_ = subgraph_->tensors(); operators_ = subgraph_->operators(); + context_.tensors_size = tensors_->size(); context_.tensors = reinterpret_cast(tensor_allocator_->AllocateMemory( @@ -85,14 +86,17 @@ MicroInterpreter::MicroInterpreter(const Model* model, context_.impl_ = static_cast(this); context_.ReportError = ReportOpError; context_.recommended_num_threads = 1; + initialization_status_ = AllocateInputAndActTensors(); if (initialization_status_ != kTfLiteOk) { return; } + initialization_status_ = AllocateTemporaryTensors(); if (initialization_status_ != kTfLiteOk) { return; } + // If the system is big endian then convert weights from the flatbuffer from // little to big endian on startup so that it does not need to be done during // inference. diff --git a/tensorflow/lite/experimental/micro/simple_tensor_allocator.cc b/tensorflow/lite/experimental/micro/simple_tensor_allocator.cc index ad2327cf39a..16eb01ecd4d 100644 --- a/tensorflow/lite/experimental/micro/simple_tensor_allocator.cc +++ b/tensorflow/lite/experimental/micro/simple_tensor_allocator.cc @@ -82,6 +82,7 @@ TfLiteStatus SimpleTensorAllocator::AllocateTensor( TF_LITE_ENSURE_STATUS(ConvertTensorType(flatbuffer_tensor.type(), &result->type, error_reporter)); result->is_variable = flatbuffer_tensor.is_variable(); + result->data.raw = nullptr; result->bytes = 0; if (auto* buffer = (*buffers)[flatbuffer_tensor.buffer()]) { From c5ea82b214479a87bd18a45dc0fa8d67545b408c Mon Sep 17 00:00:00 2001 From: "Wen-Heng (Jack) Chung" Date: Wed, 3 Jul 2019 10:38:51 -0500 Subject: [PATCH 0055/3053] Initial commit to introduce ROCm-Device-Libs into TensorFlow ROCm build ROCm-Device-Libs is used by XLA on ROCm for various device intrinsics. 
--- tensorflow/core/BUILD | 34 ++++++++++++++++++ .../core/platform/default/build_config.bzl | 9 +++++ .../core/platform/default/rocm_rocdl_path.cc | 32 +++++++++++++++++ tensorflow/core/platform/rocm_rocdl_path.cc | 26 ++++++++++++++ tensorflow/core/platform/rocm_rocdl_path.h | 32 +++++++++++++++++ .../core/platform/rocm_rocdl_path_test.cc | 36 +++++++++++++++++++ 6 files changed, 169 insertions(+) create mode 100644 tensorflow/core/platform/default/rocm_rocdl_path.cc create mode 100644 tensorflow/core/platform/rocm_rocdl_path.cc create mode 100644 tensorflow/core/platform/rocm_rocdl_path.h create mode 100644 tensorflow/core/platform/rocm_rocdl_path_test.cc diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index c0cb57a6499..27b73a03e36 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -132,6 +132,9 @@ load( "tf_additional_numa_lib_defines", "tf_additional_proto_hdrs", "tf_additional_proto_srcs", + "tf_additional_rocdl_data", + "tf_additional_rocdl_deps", + "tf_additional_rocdl_srcs", "tf_additional_test_deps", "tf_additional_test_srcs", "tf_additional_verbs_lib_defines", @@ -155,6 +158,7 @@ load( "if_dynamic_kernels", "if_static", "tf_cuda_tests_tags", + "tf_gpu_tests_tags", ) load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") load("@local_config_tensorrt//:build_defs.bzl", "if_tensorrt") @@ -1845,6 +1849,7 @@ filegroup( # :platform_base, a common dependency for downstream targets. "platform/**/env_time.cc", "platform/**/logging.cc", + "platform/**/rocm_rocdl_path.*", "platform/default/test_benchmark.*", "platform/cuda.h", "platform/rocm.h", @@ -2521,6 +2526,7 @@ cc_library( "platform/**/logger.cc", "platform/**/logging.cc", "platform/**/human_readable_json.cc", + "platform/**/rocm_rocdl_path.cc", "platform/abi.cc", "platform/protobuf.cc", ], @@ -2537,6 +2543,8 @@ cc_library( "platform/**/logger.cc", "platform/**/logging.cc", "platform/**/human_readable_json.cc", + "platform/**/rocm.h", + "platform/**/rocm_rocdl_path.cc", "platform/abi.cc", ] + # Protobuf deps already included through the ":lib_proto_parsing" @@ -4581,6 +4589,20 @@ tf_cuda_cc_test( ], ) +tf_cc_test_gpu( + name = "rocm_rocdl_path_test", + size = "small", + srcs = ["platform/rocm_rocdl_path_test.cc"], + linkstatic = tf_kernel_tests_linkstatic(), + tags = tf_gpu_tests_tags(), + deps = [ + ":rocm_rocdl_path", + ":lib", + ":test", + ":test_main", + ], +) + tf_cuda_only_cc_test( name = "util_gpu_kernel_helper_test", srcs = [ @@ -5557,6 +5579,18 @@ cc_library( ] + tf_additional_libdevice_deps(), ) +cc_library( + name = "rocm_rocdl_path", + srcs = ["platform/rocm_rocdl_path.cc"] + tf_additional_rocdl_srcs(), + hdrs = ["platform/rocm_rocdl_path.h"], + copts = tf_copts(), + data = tf_additional_rocdl_data(), + visibility = ["//visibility:public"], + deps = [ + ":lib", + ] + tf_additional_rocdl_deps(), +) + transitive_hdrs( name = "headers", visibility = ["//tensorflow:__subpackages__"], diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl index 7acba90ad22..43561a17ac2 100644 --- a/tensorflow/core/platform/default/build_config.bzl +++ b/tensorflow/core/platform/default/build_config.bzl @@ -626,6 +626,15 @@ def tf_additional_libdevice_deps(): def tf_additional_libdevice_srcs(): return ["platform/default/cuda_libdevice_path.cc"] +def tf_additional_rocdl_data(): + return [] + +def tf_additional_rocdl_deps(): + return ["@local_config_rocm//rocm:rocm_headers"] + +def tf_additional_rocdl_srcs(): + return ["platform/default/rocm_rocdl_path.cc"] 
+ def tf_additional_test_deps(): return [] diff --git a/tensorflow/core/platform/default/rocm_rocdl_path.cc b/tensorflow/core/platform/default/rocm_rocdl_path.cc new file mode 100644 index 00000000000..3525b6c9b34 --- /dev/null +++ b/tensorflow/core/platform/default/rocm_rocdl_path.cc @@ -0,0 +1,32 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/platform/rocm_rocdl_path.h" + +#include + +#if !defined(PLATFORM_GOOGLE) +#include "rocm/rocm_config.h" +#endif +#include "tensorflow/core/platform/logging.h" + +namespace tensorflow { + +string ROCmRoot() { + VLOG(3) << "ROCM root = " << TF_ROCM_TOOLKIT_PATH; + return TF_ROCM_TOOLKIT_PATH; +} + +} // namespace tensorflow diff --git a/tensorflow/core/platform/rocm_rocdl_path.cc b/tensorflow/core/platform/rocm_rocdl_path.cc new file mode 100644 index 00000000000..1e69da85b65 --- /dev/null +++ b/tensorflow/core/platform/rocm_rocdl_path.cc @@ -0,0 +1,26 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/platform/rocm_rocdl_path.h" + +#include "tensorflow/core/lib/io/path.h" + +namespace tensorflow { + +string ROCDLRoot() { + return tensorflow::io::JoinPath(tensorflow::ROCmRoot(), "hcc/lib"); +} + +} // namespace tensorflow diff --git a/tensorflow/core/platform/rocm_rocdl_path.h b/tensorflow/core/platform/rocm_rocdl_path.h new file mode 100644 index 00000000000..92b119fe816 --- /dev/null +++ b/tensorflow/core/platform/rocm_rocdl_path.h @@ -0,0 +1,32 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_ROCM_ROCDL_PATH_H_ +#define THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_ROCM_ROCDL_PATH_H_ + +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { + +// Returns the root directory of the ROCM SDK, which contains sub-folders such +// as bin, lib, and rocdl. +string ROCmRoot(); + +// Returns the directory that contains ROCm-Device-Libs files in the ROCm SDK. +string ROCDLRoot(); + +} // namespace tensorflow + +#endif // THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_ROCM_ROCDL_PATH_H_ diff --git a/tensorflow/core/platform/rocm_rocdl_path_test.cc b/tensorflow/core/platform/rocm_rocdl_path_test.cc new file mode 100644 index 00000000000..3565d3a7f95 --- /dev/null +++ b/tensorflow/core/platform/rocm_rocdl_path_test.cc @@ -0,0 +1,36 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/platform/rocm_rocdl_path.h" + +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { + +#if TENSORFLOW_USE_ROCM +TEST(ROCmROCDLPathTest, ROCDLPath) { + VLOG(2) << "ROCm-Deivce-Libs root = " << ROCDLRoot(); + std::vector rocdl_files; + TF_EXPECT_OK(Env::Default()->GetMatchingPaths( + io::JoinPath(ROCDLRoot(), "*.amdgcn.bc"), + &rocdl_files)); + EXPECT_LT(0, rocdl_files.size()); +} +#endif + +} // namespace tensorflow From 306b9ad0b7e192abdc64c14426c3a93d84e41c69 Mon Sep 17 00:00:00 2001 From: "Wen-Heng (Jack) Chung" Date: Wed, 3 Jul 2019 16:11:33 +0000 Subject: [PATCH 0056/3053] Tame upstream Ubuntu Makefile check --- tensorflow/core/platform/default/rocm_rocdl_path.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/platform/default/rocm_rocdl_path.cc b/tensorflow/core/platform/default/rocm_rocdl_path.cc index 3525b6c9b34..00a50be16d1 100644 --- a/tensorflow/core/platform/default/rocm_rocdl_path.cc +++ b/tensorflow/core/platform/default/rocm_rocdl_path.cc @@ -18,7 +18,7 @@ limitations under the License. 
#include #if !defined(PLATFORM_GOOGLE) -#include "rocm/rocm_config.h" +#include "third_party/gpus/rocm/rocm_config.h" #endif #include "tensorflow/core/platform/logging.h" From 82a696d1ac0e10ab64e42dce370ccf765de96e9f Mon Sep 17 00:00:00 2001 From: "Wen-Heng (Jack) Chung" Date: Wed, 3 Jul 2019 16:14:27 +0000 Subject: [PATCH 0057/3053] Tame buildifier check --- tensorflow/core/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 27b73a03e36..6361262c720 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -4596,8 +4596,8 @@ tf_cc_test_gpu( linkstatic = tf_kernel_tests_linkstatic(), tags = tf_gpu_tests_tags(), deps = [ - ":rocm_rocdl_path", ":lib", + ":rocm_rocdl_path", ":test", ":test_main", ], From 7691e99586e336c5dc4b7209f355c79019b8cf3e Mon Sep 17 00:00:00 2001 From: amoitra Date: Wed, 3 Jul 2019 12:57:46 -0700 Subject: [PATCH 0058/3053] Enable use of cudnn backprop APIs for grouped convolutions --- .../xla/service/gpu/cudnn_conv_rewriter.cc | 51 +++++++----- .../service/gpu/cudnn_conv_rewriter_test.cc | 80 +++++++++++++++++++ 2 files changed, 111 insertions(+), 20 deletions(-) mode change 100644 => 100755 tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc old mode 100644 new mode 100755 index e81850db69e..21ef810e64b --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc @@ -89,13 +89,11 @@ bool CanImplementAsCudnnForwardConv(HloInstruction* conv) { // Try to match a backward filter pattern that contains "conv". // Precondition: "conv" is a kConvolution. -std::tuple MatchBackwardFilter( - HloInstruction* conv) { +std::tuple +MatchBackwardFilter(HloInstruction* conv) { const auto no_match_result = - std::make_tuple(false, Window(), ConvolutionDimensionNumbers()); - if (conv->feature_group_count() > 1) { - return no_match_result; - } + std::make_tuple(false, Window(), ConvolutionDimensionNumbers(), nullptr); + // Step 1: match the instruction pattern without considering the paddings and // dimension numbers just yet. 
We may need some generic pattern matcher // similar to third_party/llvm/llvm/include/llvm/IR/PatternMatch.h @@ -248,7 +246,29 @@ std::tuple MatchBackwardFilter( backward_conv_dnums.add_kernel_spatial_dimensions(output_spatial_dims[i]); } - return std::make_tuple(true, backward_conv_window, backward_conv_dnums); + HloInstruction* lhs = conv->mutable_operand(0); + if (conv->feature_group_count() == 1) { + return std::make_tuple(true, backward_conv_window, backward_conv_dnums, + lhs); + } + Shape new_shape = lhs->shape(); + + int64 input_batch_dimension = backward_conv_dnums.input_batch_dimension(); + int64 input_feature_dimension = backward_conv_dnums.input_feature_dimension(); + + int64 input_batch = new_shape.dimensions(input_batch_dimension); + int64 input_feature = new_shape.dimensions(input_feature_dimension); + + // Ensure that input_batch is exact multiple of conv->feature_group_count() + CHECK_EQ(input_batch % conv->feature_group_count(), 0); + new_shape.set_dimensions(input_batch_dimension, + input_batch / conv->feature_group_count()); + new_shape.set_dimensions(input_feature_dimension, + input_feature * conv->feature_group_count()); + + HloComputation* c = conv->parent(); + lhs = c->AddInstruction(HloInstruction::CreateReshape(new_shape, lhs)); + return std::make_tuple(true, backward_conv_window, backward_conv_dnums, lhs); } // Try to match a backward input pattern that contains "conv". @@ -258,15 +278,6 @@ MatchBackwardInput(HloInstruction* conv) { const auto no_match_result = std::make_tuple(false, Window(), ConvolutionDimensionNumbers(), nullptr); - // TODO(b/119479517): Theoretically cuDNN supports grouped convolutions also - // for the backward input convolution, but at least for now with version 7.1.4 - // it is slower. This needs to be re-evaluated for future cuDNN versions. - // Note that we already have the necessary code down below, the only thing to - // enable it is to remove the following early return. - if (conv->feature_group_count() > 1) { - return no_match_result; - } - // Match instruction pattern. 
CHECK_EQ(HloOpcode::kConvolution, conv->opcode()); HloInstruction* reverse_filter = conv->mutable_operand(1); @@ -503,13 +514,13 @@ StatusOr RunOnInstruction(HloInstruction* conv) { Window window; ConvolutionDimensionNumbers dnums; HloInstruction* rhs; + HloInstruction* lhs; - std::tie(match, window, dnums) = MatchBackwardFilter(conv); + std::tie(match, window, dnums, lhs) = MatchBackwardFilter(conv); if (match) { return CreateCudnnConv(kCudnnConvBackwardFilterCallTarget, conv->shape(), - conv->mutable_operand(0), conv->mutable_operand(1), - window, dnums, conv->feature_group_count(), - conv->metadata()); + lhs, conv->mutable_operand(1), window, dnums, + conv->feature_group_count(), conv->metadata()); } std::tie(match, window, dnums, rhs) = MatchBackwardInput(conv); diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc index dbcdc2b075b..362d8d13aab 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter_test.cc @@ -135,6 +135,86 @@ TEST_F(CudnnConvRewriterTest, BackwardFilterConvolve) { << md_after_opt.DebugString() << " vs " << metadata.DebugString(); } +TEST_F(CudnnConvRewriterTest, BackwardFilterGroupConvolve) { + // In a nutshell, before pass: + // Input->batch_dim: 3 input_shape(3) = 4 + // Input->feature_dim: 0 input_shape(0) = 32 + // Kernel(gradient)->kernel_input_feature_dim (gradient_batch_dimension): 0 + // Kernel(gradient)->kernel_output_feature_dim (gradient_feature_dimension): 3 + // Output(dkernel)->output_batch_dim (dkernel_input_feature_dim): 2 + // Output(dkernel)->output_feature_dim (dkernel_output_feature_dim): 3 + + // After pass: All shapes and dimension layout is brought + // back to normal as would be acceptable by cudnn + // Input->batch_dim: 0 input_shape(0) = 8 + // Input->feature_dim: 3 input_shape(3) = 16 + // Kernel(gradient)->kernel_input_feature_dim (gradient_batch_dimension): 2 + // Kernel(gradient)->kernel_output_feature_dim (gradient_feature_dimension): 3 + // Output(dkernel)->output_batch_dim (dkernel_input_feature_dim): 0 + // Output(dkernel)->output_feature_dim (dkernel_output_feature_dim): 3 + HloComputation::Builder builder(TestName()); + HloInstruction* activations = + builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeShape(F32, {32, 1, 3, 4}), "activations")); + HloInstruction* gradients = + builder.AddInstruction(HloInstruction::CreateParameter( + 1, ShapeUtil::MakeShape(F32, {8, 1, 2, 16}), "gradients")); + Window conv_window = default_conv_window_; + conv_window.mutable_dimensions(1)->set_size(2); + conv_window.mutable_dimensions(1)->set_window_dilation(2); + auto* conv = builder.AddInstruction(HloInstruction::CreateConvolve( + ShapeInference::InferConvolveShape( + activations->shape(), gradients->shape(), /*feature_group_count=*/4, + /*batch_group_count=*/1, conv_window, + tf_default_dnums_for_backward_filter_) + .ConsumeValueOrDie(), + activations, gradients, /*feature_group_count=*/4, + /*batch_group_count=*/1, conv_window, + tf_default_dnums_for_backward_filter_, DefaultPrecisionConfig(2))); + OpMetadata metadata; + metadata.set_op_name("bar"); + conv->set_metadata(metadata); + auto module = CreateNewVerifiedModule(); + HloComputation* entry_computation = + module->AddEntryComputation(builder.Build()); + EXPECT_TRUE(RunPass(module.get())); + ASSERT_THAT(entry_computation->root_instruction(), + op::GetTupleElement( + 
op::CustomCall(kCudnnConvBackwardFilterCallTarget), 0)); + // Check that metadata was preserved. + const auto& md_after_opt = + entry_computation->root_instruction()->operand(0)->metadata(); + EXPECT_TRUE(protobuf_util::ProtobufEquals(md_after_opt, metadata)) + << md_after_opt.DebugString() << " vs " << metadata.DebugString(); + const HloInstruction* custom_call = + entry_computation->root_instruction()->operand(0); + const ConvolutionDimensionNumbers conv_dim = + custom_call->convolution_dimension_numbers(); + const auto lhs_a = custom_call->operand(0); + const auto input_shape = lhs_a->shape(); + // The input (lhs) batch_dim(dim 0 in the original NHWC layout) gets mapped to + // be the feature_dim(dim 3) with a value of N*g = 32 in tf2xla. As described + // in conv_grad_ops.h, this swap is required to implement backprop using fwd + // conv. After the pass the batch_dim gets remapped to dim 0. The batch_dim + // value gets scaled to N = N*g/g = 32/4 = 8 to be compatible with cudnn + EXPECT_EQ(0, conv_dim.input_batch_dimension()); + EXPECT_EQ(8, input_shape.dimensions(conv_dim.input_batch_dimension())); + // Similarly, the input (lhs) feature_dim(dim 3 in the original NHWC layout) + // gets mapped to be the batch_dim(dim 0) with a value of C/g = 4 in tf2xla. + // After the pass the batch_dim gets remapped to dim 0. The feature_dim value + // gets scaled to C = C/g*g = 4*4 = 16 to be compatible with cudnn + EXPECT_EQ(3, conv_dim.input_feature_dimension()); + EXPECT_EQ(16, input_shape.dimensions(conv_dim.input_feature_dimension())); + // Similarly, the feature and batch dims of the incoming gradients (used as + // rhs) and the in/out dims of the output of convolution i.e, dgrad have been + // been modified in tf2xla (as described in conv_grad_ops.h). This pass remaps + // everything back for the layout to be compatible with cudnn backprop APIs. + EXPECT_EQ(2, conv_dim.kernel_input_feature_dimension()); + EXPECT_EQ(3, conv_dim.kernel_output_feature_dimension()); + EXPECT_EQ(0, conv_dim.output_batch_dimension()); + EXPECT_EQ(3, conv_dim.output_feature_dimension()); +} + TEST_F(CudnnConvRewriterTest, BackwardFilterConvolveEquivalentToForwardConvolution) { HloComputation::Builder builder(TestName()); From f74e5e1a1984434397d677d7fd174b8d8fd7670f Mon Sep 17 00:00:00 2001 From: Trevor Morris Date: Wed, 3 Jul 2019 15:22:41 -0700 Subject: [PATCH 0059/3053] Improve Flatten to avoid using dynamic shapes in more situations. Flatten currently creates a reshape which introduces a dependency on the size of the batch dimension, which is not commonly known statically. This means that the constant folding grappler pass cannot resolve the shapes ahead of time. This also makes it difficult to convert using TF-TRT. 
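For illustration only, a minimal standalone sketch of the behaviour this change aims for; it is not part of the patch, and the input shape below is a made-up example:

    import numpy as np
    import tensorflow as tf

    inputs = tf.keras.Input(shape=(4, 4, 3))   # batch dimension is unknown (None)
    flat = tf.keras.layers.Flatten()(inputs)
    print(flat.shape)                          # (None, 48): fully static target shape

    # The old behaviour was equivalent to reshaping with the runtime batch size,
    #   tf.reshape(x, (tf.shape(x)[0], -1)),
    # which makes the target shape depend on a dynamic dimension. When every
    # non-batch dimension is statically known, the reshape target can instead be
    # computed up front:
    static_flat = tf.reshape(tf.zeros((2, 4, 4, 3)), (-1, int(np.prod((4, 4, 3)))))
    print(static_flat.shape)                   # (2, 48)

With this change the dynamic tf.shape(inputs)[0] form is only used as a fallback when one of the non-batch dimensions is genuinely unknown.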
--- tensorflow/python/keras/layers/core.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py index e28a8e52f15..eb45636e677 100644 --- a/tensorflow/python/keras/layers/core.py +++ b/tensorflow/python/keras/layers/core.py @@ -580,9 +580,13 @@ class Flatten(Layer): permutation.append(1) inputs = array_ops.transpose(inputs, perm=permutation) - outputs = array_ops.reshape( - inputs, (tensor_shape.dimension_value(inputs.shape[0]) or - array_ops.shape(inputs)[0], -1)) + input_shape = tensor_shape.TensorShape(inputs.shape).as_list() + if input_shape and all(input_shape[1:]): + outputs = array_ops.reshape(inputs, (-1, np.prod(input_shape[1:]))) + else: + outputs = array_ops.reshape( + inputs, (tensor_shape.dimension_value(inputs.shape[0]) or + array_ops.shape(inputs)[0], -1)) if not context.executing_eagerly(): outputs.set_shape(self.compute_output_shape(inputs.shape)) return outputs From 0bfa245f5c22b560c729038bfbeb310e0468a23c Mon Sep 17 00:00:00 2001 From: "Wen-Heng (Jack) Chung" Date: Wed, 3 Jul 2019 23:14:39 +0000 Subject: [PATCH 0060/3053] adding a DISABLED_ON_GPU_ROCM macro to disable subtests that are not yet supported on ROCm. Applying that macro to a few subtests in convolution_test.cc and convolution_variants_test.cc --- .../compiler/xla/tests/convolution_test.cc | 16 ++++++++++++---- .../xla/tests/convolution_variants_test.cc | 8 ++++++-- tensorflow/compiler/xla/tests/test_macros.h | 7 +++++++ 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc index 0ab765aefa0..b58d28ae582 100644 --- a/tensorflow/compiler/xla/tests/convolution_test.cc +++ b/tensorflow/compiler/xla/tests/convolution_test.cc @@ -408,7 +408,9 @@ class Convolve1D_1x2x5_1x2x2_WithPadding : public ConvolutionTest { TYPED_TEST_CASE(Convolve1D_1x2x5_1x2x2_WithPadding, TestTypes); TYPED_TEST(Convolve1D_1x2x5_1x2x2_WithPadding, Types) { this->RunTest(); } -XLA_TEST_F(ConvolutionTest, Convolve3D_1x4x2x3x3_2x2x2x3x3_Valid) { +// 5D tensors are not yet supported in ROCm +XLA_TEST_F(ConvolutionTest, + DISABLED_ON_GPU_ROCM(Convolve3D_1x4x2x3x3_2x2x2x3x3_Valid)) { XlaBuilder builder(TestName()); std::vector input_dims = {1, 4, 2, 3, 3}; std::vector filter_dims = {2, 2, 2, 3, 3}; @@ -1946,7 +1948,9 @@ XLA_TEST_F(ConvolutionTest, ConvolveF32BackwardInputGroupedConvolution) { class ConvolutionHloTest : public HloTestBase {}; -XLA_TEST_F(ConvolutionHloTest, ConvolveF64Forward) { +// double datatype is not yet supported in ROCm +XLA_TEST_F(ConvolutionHloTest, + DISABLED_ON_GPU_ROCM(DISABLED_ON_CPU(ConvolveF64Forward))) { constexpr char kHlo[] = R"( HloModule TestModule @@ -1970,7 +1974,9 @@ ENTRY Test { EXPECT_TRUE(RunAndCompare(kHlo, ErrorSpec{0.001})); } -XLA_TEST_F(ConvolutionHloTest, ConvolveF64BackwardFilter) { +// double datatype is not yet supported in ROCm +XLA_TEST_F(ConvolutionHloTest, + DISABLED_ON_GPU_ROCM(DISABLED_ON_CPU(ConvolveF64BackwardFilter))) { constexpr char kHlo[] = R"( HloModule TestModule @@ -1982,7 +1988,9 @@ ENTRY Test { EXPECT_TRUE(RunAndCompare(kHlo, ErrorSpec{0.001})); } -XLA_TEST_F(ConvolutionHloTest, ConvolveF64BackwardInput) { +// double datatype is not yet supported in ROCm +XLA_TEST_F(ConvolutionHloTest, + DISABLED_ON_GPU_ROCM(DISABLED_ON_CPU(ConvolveF64BackwardInput))) { constexpr char kHlo[] = R"( HloModule TestModule diff --git 
a/tensorflow/compiler/xla/tests/convolution_variants_test.cc b/tensorflow/compiler/xla/tests/convolution_variants_test.cc index ba3e9c436e3..ff5503b08e9 100644 --- a/tensorflow/compiler/xla/tests/convolution_variants_test.cc +++ b/tensorflow/compiler/xla/tests/convolution_variants_test.cc @@ -1330,7 +1330,9 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding1D) { ComputeAndCompareR3(&builder, {{{13, 24, 130}}}, {}, error_spec_); } -XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding3D) { +// 5D tensors are not yet supported in ROCm +XLA_TEST_F(ConvolutionVariantsTest, + DISABLED_ON_GPU_ROCM(BackwardInputEvenPadding3D)) { XlaBuilder builder(TestName()); auto gradients_flat = LiteralUtil::CreateR1({1}); @@ -1354,7 +1356,9 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding3D) { ComputeAndCompareLiteral(&builder, expected_literal, {}, error_spec_); } -XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding3D) { +// 5D tensors are not yet supported in ROCm +XLA_TEST_F(ConvolutionVariantsTest, + DISABLED_ON_GPU_ROCM(BackwardFilterEvenPadding3D)) { XlaBuilder builder(TestName()); auto activations_flat = LiteralUtil::CreateR1({1, 2, 3, 4}); diff --git a/tensorflow/compiler/xla/tests/test_macros.h b/tensorflow/compiler/xla/tests/test_macros.h index 9636df2ff5f..4bbb0d3f9cb 100644 --- a/tensorflow/compiler/xla/tests/test_macros.h +++ b/tensorflow/compiler/xla/tests/test_macros.h @@ -36,6 +36,7 @@ limitations under the License. #define DISABLED_ON_CPU(X) X #define DISABLED_ON_GPU(X) X +#define DISABLED_ON_GPU_ROCM(X) X #define DISABLED_ON_INTERPRETER(X) X // We need this macro instead of pasting directly to support nesting @@ -54,6 +55,12 @@ limitations under the License. #ifdef XLA_TEST_BACKEND_GPU # undef DISABLED_ON_GPU # define DISABLED_ON_GPU(X) XLA_TEST_PASTE(DISABLED_, X) + +#if TENSORFLOW_USE_ROCM +# undef DISABLED_ON_GPU_ROCM +# define DISABLED_ON_GPU_ROCM(X) XLA_TEST_PASTE(DISABLED_, X) +#endif // TENSORFLOW_USE_ROCM + #endif // XLA_TEST_BACKEND_GPU #ifdef XLA_TEST_BACKEND_INTERPRETER From 43917009be8f86c6bfcb1fd029a513eb023cd23a Mon Sep 17 00:00:00 2001 From: Zantares Date: Thu, 4 Jul 2019 13:04:46 +0800 Subject: [PATCH 0061/3053] Replace redundant attribute function with a generic function. --- tensorflow/core/graph/mkl_layout_pass.cc | 515 +++-------------------- 1 file changed, 65 insertions(+), 450 deletions(-) diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index df3cf19e2c0..5a4c211c194 100644 --- a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -355,39 +355,38 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // NOTE: names are alphabetically sorted. 
rinfo_.push_back({csinfo_.addn, mkl_op_registry::GetMklOpName(csinfo_.addn), - CopyAttrsAddN, AlwaysRewrite, + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back({csinfo_.add, mkl_op_registry::GetMklOpName(csinfo_.add), - CopyAttrsDataType, AlwaysRewrite, + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back( {csinfo_.avg_pool, mkl_op_registry::GetMklOpName(csinfo_.avg_pool), - CopyAttrsPooling, AlwaysRewrite, kRewriteForLayoutPropagation}); + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back({csinfo_.avg_pool_grad, mkl_op_registry::GetMklOpName(csinfo_.avg_pool_grad), - CopyAttrsPooling, AlwaysRewrite, + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back( {csinfo_.avg_pool3d, mkl_op_registry::GetMklOpName(csinfo_.avg_pool3d), - CopyAttrsPooling, AlwaysRewrite, kRewriteForLayoutPropagation}); + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back({csinfo_.avg_pool3d_grad, mkl_op_registry::GetMklOpName(csinfo_.avg_pool3d_grad), - CopyAttrsPooling, AlwaysRewrite, + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back({csinfo_.batch_matmul, mkl_op_registry::GetMklOpName(csinfo_.batch_matmul), - CopyAttrsBatchMatMul, AlwaysRewrite, - kRewriteForOpNameChange}); + CopyAttrsAll, AlwaysRewrite, kRewriteForOpNameChange}); rinfo_.push_back( {csinfo_.concat, mkl_op_registry::GetMklOpName(csinfo_.concat), - CopyAttrsConcat, AlwaysRewrite, kRewriteForLayoutPropagation}); + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back( {csinfo_.concatv2, mkl_op_registry::GetMklOpName(csinfo_.concatv2), - CopyAttrsConcatV2, AlwaysRewrite, kRewriteForLayoutPropagation}); + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back( {csinfo_.conjugate_transpose, mkl_op_registry::GetMklOpName(csinfo_.conjugate_transpose), - CopyAttrsTranspose, AlwaysRewrite, kRewriteForOpNameChange}); + CopyAttrsAll, AlwaysRewrite, kRewriteForOpNameChange}); rinfo_.push_back({csinfo_.conv2d, mkl_op_registry::GetMklOpName(csinfo_.conv2d), CopyAttrsConvCheckConstFilter, AlwaysRewrite, @@ -425,76 +424,72 @@ class MklLayoutRewritePass : public GraphOptimizationPass { rinfo_.push_back( {csinfo_.depthwise_conv2d_grad_input, mkl_op_registry::GetMklOpName(csinfo_.depthwise_conv2d_grad_input), - CopyAttrsConv2DDepthwise, AlwaysRewrite, - kRewriteForLayoutPropagation}); + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back( {csinfo_.depthwise_conv2d_grad_filter, mkl_op_registry::GetMklOpName(csinfo_.depthwise_conv2d_grad_filter), - CopyAttrsConv2DDepthwise, AlwaysRewrite, - kRewriteForLayoutPropagation}); + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back( {csinfo_.dequantize, mkl_op_registry::GetMklOpName(csinfo_.dequantize), - CopyAttrsDequantize, DequantizeRewrite, kRewriteForLayoutPropagation}); + CopyAttrsAll, DequantizeRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back({csinfo_.fused_batch_norm, mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm), - CopyAttrsFusedBatchNorm, AlwaysRewrite, + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back( {csinfo_.fused_batch_norm_grad, mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad), - CopyAttrsFusedBatchNorm, AlwaysRewrite, kRewriteForLayoutPropagation}); + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back( {csinfo_.fused_batch_norm_v2, 
mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_v2), - CopyAttrsFusedBatchNormV2, AlwaysRewrite, - kRewriteForLayoutPropagation}); + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back( {csinfo_.fused_batch_norm_grad_v2, mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad_v2), - CopyAttrsFusedBatchNormV2, AlwaysRewrite, - kRewriteForLayoutPropagation}); + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back({csinfo_.fused_conv2d, csinfo_.mkl_fused_conv2d, CopyAttrsFusedConv2D, FusedConv2DRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back( {csinfo_.identity, mkl_op_registry::GetMklOpName(csinfo_.identity), - CopyAttrsDataType, AlwaysRewrite, kRewriteForLayoutPropagation}); + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back({csinfo_.lrn, mkl_op_registry::GetMklOpName(csinfo_.lrn), - CopyAttrsLRN, LrnRewrite, kRewriteForLayoutPropagation}); + CopyAttrsAll, LrnRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back( {csinfo_.lrn_grad, mkl_op_registry::GetMklOpName(csinfo_.lrn_grad), - CopyAttrsLRN, LrnGradRewrite, kRewriteForLayoutPropagation}); + CopyAttrsAll, LrnGradRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back({csinfo_.matmul, mkl_op_registry::GetMklOpName(csinfo_.matmul), - CopyAttrsMatMul, AlwaysRewrite, kRewriteForOpNameChange}); + CopyAttrsAll, AlwaysRewrite, kRewriteForOpNameChange}); rinfo_.push_back( {csinfo_.leakyrelu, mkl_op_registry::GetMklOpName(csinfo_.leakyrelu), - CopyAttrsLeakyRelu, LeakyReluRewrite, kRewriteForLayoutPropagation}); + CopyAttrsAll, LeakyReluRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back({csinfo_.leakyrelu_grad, mkl_op_registry::GetMklOpName(csinfo_.leakyrelu_grad), - CopyAttrsLeakyRelu, LeakyReluRewrite, + CopyAttrsAll, LeakyReluRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back({csinfo_.max_pool, mkl_op_registry::GetMklOpName(csinfo_.max_pool), - CopyAttrsPooling, NonDepthBatchWisePoolRewrite, + CopyAttrsAll, NonDepthBatchWisePoolRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back({csinfo_.max_pool_grad, mkl_op_registry::GetMklOpName(csinfo_.max_pool_grad), - CopyAttrsPooling, MaxpoolGradRewrite, + CopyAttrsAll, MaxpoolGradRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back({csinfo_.max_pool3d, mkl_op_registry::GetMklOpName(csinfo_.max_pool3d), - CopyAttrsPooling, NonDepthBatchWisePoolRewrite, + CopyAttrsAll, NonDepthBatchWisePoolRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back({csinfo_.max_pool3d_grad, mkl_op_registry::GetMklOpName(csinfo_.max_pool3d_grad), - CopyAttrsPooling, AlwaysRewrite, + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back( {csinfo_.maximum, mkl_op_registry::GetMklOpName(csinfo_.maximum), - CopyAttrsDataType, AlwaysRewrite, kRewriteForLayoutPropagation}); + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back({csinfo_.mul, mkl_op_registry::GetMklOpName(csinfo_.mul), - CopyAttrsDataType, AlwaysRewrite, + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back({csinfo_.pad_with_conv2d, csinfo_.mkl_pad_with_conv2d, CopyAttrsPadWithConv2D, AlwaysRewrite, @@ -505,11 +500,11 @@ class MklLayoutRewritePass : public GraphOptimizationPass { kRewriteForLayoutPropagation}); rinfo_.push_back({csinfo_.quantized_avg_pool, mkl_op_registry::GetMklOpName(csinfo_.quantized_avg_pool), - CopyAttrsQuantizedPooling, AlwaysRewrite, + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); 
rinfo_.push_back({csinfo_.quantized_concatv2, mkl_op_registry::GetMklOpName(csinfo_.quantized_concatv2), - CopyAttrsConcatV2, AlwaysRewrite, + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back({csinfo_.quantized_conv2d, mkl_op_registry::GetMklOpName(csinfo_.quantized_conv2d), @@ -558,7 +553,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { kRewriteForLayoutPropagation}); rinfo_.push_back({csinfo_.quantized_max_pool, mkl_op_registry::GetMklOpName(csinfo_.quantized_max_pool), - CopyAttrsQuantizedPooling, AlwaysRewrite, + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back({csinfo_.quantized_conv2d_with_bias_sum_and_relu, mkl_op_registry::GetMklOpName( @@ -615,55 +610,55 @@ class MklLayoutRewritePass : public GraphOptimizationPass { kRewriteForLayoutPropagation}); rinfo_.push_back({csinfo_.quantize_v2, mkl_op_registry::GetMklOpName(csinfo_.quantize_v2), - CopyAttrsQuantizeV2, QuantizeOpRewrite, + CopyAttrsAll, QuantizeOpRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back({csinfo_.relu, mkl_op_registry::GetMklOpName(csinfo_.relu), - CopyAttrsDataType, AlwaysRewrite, + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back( {csinfo_.relu_grad, mkl_op_registry::GetMklOpName(csinfo_.relu_grad), - CopyAttrsDataType, AlwaysRewrite, kRewriteForLayoutPropagation}); + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back( {csinfo_.relu6, mkl_op_registry::GetMklOpName(csinfo_.relu6), - CopyAttrsDataType, AlwaysRewrite, kRewriteForLayoutPropagation}); + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back( {csinfo_.relu6_grad, mkl_op_registry::GetMklOpName(csinfo_.relu6_grad), - CopyAttrsDataType, AlwaysRewrite, kRewriteForLayoutPropagation}); + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back( {csinfo_.requantize, mkl_op_registry::GetMklOpName(csinfo_.requantize), - CopyAttrsRequantize, AlwaysRewrite, kRewriteForLayoutPropagation}); + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); // Disable these two MKL operators for now due to some test failures caused // by these two ops /* rinfo_.push_back({csinfo_.tanh, mkl_op_registry::GetMklOpName(csinfo_.tanh), - CopyAttrsDataType, AlwaysRewrite, + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back({csinfo_.tanh_grad, mkl_op_registry::GetMklOpName(csinfo_.tanh_grad), - CopyAttrsDataType, AlwaysRewrite, + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); */ rinfo_.push_back( {csinfo_.reshape, mkl_op_registry::GetMklOpName(csinfo_.reshape), - CopyAttrsReshape, AlwaysRewrite, kRewriteForLayoutPropagation}); + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back( {csinfo_.slice, mkl_op_registry::GetMklOpName(csinfo_.slice), - CopyAttrsSlice, AlwaysRewrite, kRewriteForLayoutPropagation}); + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back( {csinfo_.softmax, mkl_op_registry::GetMklOpName(csinfo_.softmax), - CopyAttrsDataType, AlwaysRewrite, kRewriteForLayoutPropagation}); + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back({csinfo_.squared_difference, mkl_op_registry::GetMklOpName(csinfo_.squared_difference), - CopyAttrsDataType, AlwaysRewrite, + CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); rinfo_.push_back({csinfo_.sub, mkl_op_registry::GetMklOpName(csinfo_.sub), - CopyAttrsDataType, AlwaysRewrite, + CopyAttrsAll, AlwaysRewrite, 
kRewriteForLayoutPropagation}); - rinfo_.push_back( - {csinfo_.transpose, mkl_op_registry::GetMklOpName(csinfo_.transpose), - CopyAttrsTranspose, AlwaysRewrite, kRewriteForOpNameChange}); + rinfo_.push_back({csinfo_.transpose, + mkl_op_registry::GetMklOpName(csinfo_.transpose), + CopyAttrsAll, AlwaysRewrite, kRewriteForOpNameChange}); // Add info about which ops to add workspace edge to and the slots. wsinfo_.push_back({csinfo_.lrn, csinfo_.lrn_grad, 0, 2, 1, 3}); @@ -1739,41 +1734,17 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // We need operator-specific function to copy attributes because the framework // does not provide any generic function for it. // NOTE: names are alphabetically sorted. - static void CopyAttrsAddN(const Node* orig_node, NodeBuilder* nb, - bool change_format = false); - static void CopyAttrsBatchMatMul(const Node* orig_node, NodeBuilder* nb, - bool change_format = false); - static void CopyAttrsBiasAddGrad(const Node* orig_node, NodeBuilder* nb, - bool change_format = false); - static void CopyAttrsConcat(const Node* orig_node, NodeBuilder* nb, - bool change_format = false); - static void CopyAttrsConcatV2(const Node* orig_node, NodeBuilder* nb, - bool change_format = false); + static void CopyAttrsAll(const Node* orig_node, NodeBuilder* nb, + bool change_format = false); static void CopyAttrsConv(const Node* orig_node, NodeBuilder* nb, bool change_format = false); - static void CopyAttrsConv2DDepthwise(const Node* orig_node, NodeBuilder* nb, - bool change_format = false); static void CopyAttrsConv2DDepthwiseCheckConstFilter( const Node* orig_node, NodeBuilder* nb, bool change_format = false); static void CopyAttrsConvCheckConstFilter(const Node* orig_node, NodeBuilder* nb, bool change_format = false); - static void CopyAttrsDataType(const Node* orig_node, NodeBuilder* nb, - bool change_format = false); - static void CopyAttrsDequantize(const Node* orig_node, NodeBuilder* nb, - bool change_format = false); - static void CopyAttrsFusedBatchNorm(const Node* orig_node, NodeBuilder* nb, - bool change_format = false); - static void CopyAttrsFusedBatchNormV2(const Node* orig_node, NodeBuilder* nb, - bool change_format = false); - static void CopyAttrsLeakyRelu(const Node* orig_node, NodeBuilder* nb, - bool change_format = false); static void CopyAttrsFusedConv2D(const Node* orig_node, NodeBuilder* nb, bool change_format = false); - static void CopyAttrsLRN(const Node* orig_node, NodeBuilder* nb, - bool change_format = false); - static void CopyAttrsMatMul(const Node* orig_node, NodeBuilder* nb, - bool change_format = false); static void CopyAttrsPadWithConv2D(const Node* orig_node, NodeBuilder* nb, bool change_format = false); static void CopyAttrsPadWithFusedConv2D(const Node* orig_node, @@ -1786,26 +1757,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass { const Node* orig_node2, NodeBuilder* nb, bool change_format = false); - static void CopyAttrsPooling(const Node* orig_node, NodeBuilder* nb, - bool change_format = false); - static void CopyAttrsQuantizedPooling(const Node* orig_node, NodeBuilder* nb, - bool change_format = false); static void CopyAttrsQuantizedConv2D(const Node* orig_node, NodeBuilder* nb, bool change_format = false); - static void CopyAttrsQuantizedConcat(const Node* orig_node, NodeBuilder* nb, - bool change_format = false); - static void CopyAttrsQuantizeV2(const Node* orig_node, NodeBuilder* nb, - bool change_format = false); - static void CopyAttrsReshape(const Node* orig_node, NodeBuilder* nb, - bool change_format 
= false); - static void CopyAttrsRequantize(const Node* orig_node, NodeBuilder* nb, - bool change_format = false); - static void CopyAttrsSlice(const Node* orig_node, NodeBuilder* nb, - bool change_format = false); - static void CopyAttrsSplit(const Node* orig_node, NodeBuilder* nb, - bool change_format = false); - static void CopyAttrsTranspose(const Node* orig_node, NodeBuilder* nb, - bool change_format = false); static void CopyFormatAttrsConv(const Node* orig_node, NodeBuilder* nb, const std::vector& strides, const std::vector& dilations, @@ -2355,6 +2308,21 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded( // Op-specific functions to copy attributes from old node to new node ////////////////////////////////////////////////////////////////////////// +// Generic function to copy all attributes from original node to target. +void MklLayoutRewritePass::CopyAttrsAll(const Node* orig_node, NodeBuilder* nb, + bool change_format) { + string name; + AttrSlice attr_list(orig_node->def()); + + auto iter = attr_list.begin(); + while (iter != attr_list.end()) { + name = iter->first; + auto attr = iter->second; + nb->Attr(name, attr); + iter++; + } +} + void MklLayoutRewritePass::CopyAttrsConvCheckConstFilter(const Node* orig_node, NodeBuilder* nb, bool change_format) { @@ -2381,23 +2349,6 @@ void MklLayoutRewritePass::CopyAttrsConvCheckConstFilter(const Node* orig_node, CopyFormatAttrsConv(orig_node, nb, strides, dilations, change_format); } -void MklLayoutRewritePass::CopyAttrsQuantizeV2(const Node* orig_node, - NodeBuilder* nb, - bool change_format) { - DataType T; - string mode; - string round_mode; - - // Get all attributes from old node. - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "mode", &mode)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "round_mode", &round_mode)); - - // Add attributes to new node. - nb->Attr("T", T); - nb->Attr("mode", mode); - nb->Attr("round_mode", round_mode); -} void MklLayoutRewritePass::CopyAttrsConv(const Node* orig_node, NodeBuilder* nb, bool change_format) { DataType T; @@ -2419,21 +2370,6 @@ void MklLayoutRewritePass::CopyAttrsConv(const Node* orig_node, NodeBuilder* nb, CopyFormatAttrsConv(orig_node, nb, strides, dilations, change_format); } -void MklLayoutRewritePass::CopyAttrsDequantize(const Node* orig_node, - NodeBuilder* nb, - bool change_format) { - DataType T; - string mode; - - // Get all attributes from old node. - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "mode", &mode)); - - // Add attributes to new node. - nb->Attr("T", T); - nb->Attr("mode", mode); -} - // Used in rinfo when replacing __MklDummyPadWithConv2D by _MklPadWithConv2D void MklLayoutRewritePass::CopyAttrsPadWithConv2D(const Node* orig_node, NodeBuilder* nb, @@ -2558,30 +2494,6 @@ void MklLayoutRewritePass::CopyAttrsFromPadAndFusedConv2D( nb->Attr("fused_ops", fused_ops); } -void MklLayoutRewritePass::CopyAttrsConv2DDepthwise(const Node* orig_node, - NodeBuilder* nb, - bool change_format) { - DataType T; - string data_format; - string padding; - std::vector strides; - std::vector dilations; - - // Get all attributes from old node. 
- TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "dilations", &dilations)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format)); - - // Add attributes to new node. - nb->Attr("T", T); - nb->Attr("strides", strides); - nb->Attr("dilations", dilations); - nb->Attr("padding", padding); - nb->Attr("data_format", data_format); -} - void MklLayoutRewritePass::CopyAttrsConv2DDepthwiseCheckConstFilter( const Node* orig_node, NodeBuilder* nb, bool change_format) { DataType T; @@ -2609,131 +2521,6 @@ void MklLayoutRewritePass::CopyAttrsConv2DDepthwiseCheckConstFilter( nb->Attr("data_format", data_format); } -void MklLayoutRewritePass::CopyAttrsAddN(const Node* orig_node, NodeBuilder* nb, - bool change_format) { - DataType T; - int N; - - // Get all attributes from old node. - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "N", &N)); - - // Add attributes to new node. - nb->Attr("T", T); - nb->Attr("N", N); -} - -void MklLayoutRewritePass::CopyAttrsBiasAddGrad(const Node* orig_node, - NodeBuilder* nb, - bool change_format) { - DataType T; - string data_format; - std::vector strides; - - // Get all attributes from old node. - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format)); - - // Add attributes to new node. - nb->Attr("T", T); - nb->Attr("strides", strides); - nb->Attr("data_format", data_format); -} - -void MklLayoutRewritePass::CopyAttrsLRN(const Node* orig_node, NodeBuilder* nb, - bool change_format) { - DataType T; - int depth_radius; - float bias; - float alpha; - float beta; - - // Get all attributes from old node. - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "depth_radius", &depth_radius)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "bias", &bias)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "alpha", &alpha)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "beta", &beta)); - - // Add attributes to new node. - nb->Attr("T", T); - nb->Attr("depth_radius", depth_radius); - nb->Attr("bias", bias); - nb->Attr("alpha", alpha); - nb->Attr("beta", beta); -} - -void MklLayoutRewritePass::CopyAttrsLeakyRelu(const Node* orig_node, - NodeBuilder* nb, - bool change_format) { - DataType T; - float alpha; - - // Get all attributes from old node. - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "alpha", &alpha)); - - // Add attributes to new node. - nb->Attr("T", T); - nb->Attr("alpha", alpha); -} - -void MklLayoutRewritePass::CopyAttrsPooling(const Node* orig_node, - NodeBuilder* nb, - bool change_format) { - DataType T; - string data_format; - string padding; - std::vector ksize, strides; - - // Get all attributes from old node. - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "ksize", &ksize)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format)); - - // Add attributes to new node. 
- nb->Attr("T", T); - nb->Attr("ksize", ksize); - nb->Attr("strides", strides); - nb->Attr("padding", padding); - nb->Attr("data_format", data_format); -} - -void MklLayoutRewritePass::CopyAttrsDataType(const Node* orig_node, - NodeBuilder* nb, - bool change_format) { - DataType T; - - // Get all attributes from old node. - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T)); - - // Add attributes to new node. - nb->Attr("T", T); -} - -void MklLayoutRewritePass::CopyAttrsQuantizedPooling(const Node* orig_node, - NodeBuilder* nb, - bool change_format) { - DataType T; - string padding; - std::vector ksize, strides; - - // Get all attributes from old node. - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "ksize", &ksize)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding)); - - // Add attributes to new node. - nb->Attr("T", T); - nb->Attr("ksize", ksize); - nb->Attr("strides", strides); - nb->Attr("padding", padding); -} - void MklLayoutRewritePass::CopyAttrsQuantizedConv2D(const Node* orig_node, NodeBuilder* nb, bool change_format) { @@ -2798,66 +2585,6 @@ void MklLayoutRewritePass::CopyAttrsQuantizedMatMulWithBias( if (bias_status.ToString() == "OK") nb->Attr("Tbias", Tbias); } -void MklLayoutRewritePass::CopyAttrsRequantize(const Node* orig_node, - NodeBuilder* nb, - bool change_format) { - DataType Tinput, out_type; - - // Get all attributes from old node. - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tinput", &Tinput)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "out_type", &out_type)); - - // Add attributes to new node. - nb->Attr("Tinput", Tinput); - nb->Attr("out_type", out_type); -} - -void MklLayoutRewritePass::CopyAttrsReshape(const Node* orig_node, - NodeBuilder* nb, - bool change_format) { - DataType T; - DataType Tshape; - - // Get all attributes from old node. - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tshape", &Tshape)); - - // Add attributes to new node. - nb->Attr("T", T); - nb->Attr("Tshape", Tshape); -} - -void MklLayoutRewritePass::CopyAttrsSlice(const Node* orig_node, - NodeBuilder* nb, bool change_format) { - DataType T; - DataType Index; - - // Get all attributes from old node. - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Index", &Index)); - - // Add attributes to new node. - nb->Attr("T", T); - nb->Attr("Index", Index); -} - -void MklLayoutRewritePass::CopyAttrsSplit(const Node* orig_node, - NodeBuilder* nb, bool change_format) { - DataType T; - string data_format; - int num_split; - - // Get all attributes from old node. - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "num_split", &num_split)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format)); - - // Add attributes to new node. - nb->Attr("T", T); - nb->Attr("num_split", num_split); - nb->Attr("data_format", data_format); -} - void MklLayoutRewritePass::CopyFormatAttrsConv( const Node* orig_node, NodeBuilder* nb, const std::vector& strides, const std::vector& dilations, bool change_format) { @@ -2897,70 +2624,6 @@ void MklLayoutRewritePass::CopyFormatAttrsConv( } } -void MklLayoutRewritePass::CopyAttrsConcat(const Node* orig_node, - NodeBuilder* nb, - bool change_format) { - DataType T; - int N; - - // Get all attributes from old node. 
- TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "N", &N)); - - // Add attributes to new node. - nb->Attr("T", T); - nb->Attr("N", N); -} - -void MklLayoutRewritePass::CopyAttrsConcatV2(const Node* orig_node, - NodeBuilder* nb, - bool change_format) { - DataType T; - int N; - DataType tidx; - - // Get all attributes from old node. - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "N", &N)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tidx", &tidx)); - - // Add attributes to new node. - nb->Attr("T", T); - nb->Attr("N", N); - nb->Attr("Tidx", tidx); -} - -void MklLayoutRewritePass::CopyAttrsFusedBatchNorm(const Node* orig_node, - NodeBuilder* nb, - bool change_format) { - DataType T; - float epsilon; - string data_format; - bool is_training; - - // Get all attributes from old node. - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "epsilon", &epsilon)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "is_training", &is_training)); - - // Add attributes to new node. - nb->Attr("T", T); - nb->Attr("epsilon", epsilon); - nb->Attr("data_format", data_format); - nb->Attr("is_training", is_training); -} - -void MklLayoutRewritePass::CopyAttrsFusedBatchNormV2(const Node* orig_node, - NodeBuilder* nb, - bool change_format) { - CopyAttrsFusedBatchNorm(orig_node, nb, change_format); - - DataType U; - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "U", &U)); - nb->Attr("U", U); -} - void MklLayoutRewritePass::CopyAttrsFusedConv2D(const Node* orig_node, NodeBuilder* nb, bool change_format) { @@ -2998,54 +2661,6 @@ void MklLayoutRewritePass::CopyAttrsFusedConv2D(const Node* orig_node, nb->Attr("epsilon", epsilon); } -void MklLayoutRewritePass::CopyAttrsMatMul(const Node* orig_node, - NodeBuilder* nb, - bool change_format) { - DataType T; - bool transpose_a, transpose_b; - - // Get all attributes from old node. - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "transpose_a", &transpose_a)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "transpose_b", &transpose_b)); - - // Add attributes to new node. - nb->Attr("T", T); - nb->Attr("transpose_a", transpose_a); - nb->Attr("transpose_b", transpose_b); -} - -void MklLayoutRewritePass::CopyAttrsTranspose(const Node* orig_node, - NodeBuilder* nb, - bool change_format) { - DataType T, Tperm; - - // Get all attributes from old node. - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tperm", &Tperm)); - - // Add attributes to new node. - nb->Attr("T", T); - nb->Attr("Tperm", Tperm); -} - -void MklLayoutRewritePass::CopyAttrsBatchMatMul(const Node* orig_node, - NodeBuilder* nb, - bool change_format) { - DataType T; - bool adj_x, adj_y; - - // Get all attributes from old node. - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "adj_x", &adj_x)); - TF_CHECK_OK(GetNodeAttr(orig_node->def(), "adj_y", &adj_y)); - - // Add attributes to new node. 
- nb->Attr("T", T); - nb->Attr("adj_x", adj_x); - nb->Attr("adj_y", adj_y); -} - ////////////////////////////////////////////////////////////////////////// // Helper functions related to node merge pass ////////////////////////////////////////////////////////////////////////// From f16662adca0def007da642fbc512affed0f4824d Mon Sep 17 00:00:00 2001 From: Mahmoud Abuzaina Date: Wed, 3 Jul 2019 23:45:01 -0700 Subject: [PATCH 0062/3053] Enabling MKL Conv2D FWD in eager mode --- tensorflow/core/common_runtime/eager/BUILD | 14 +- .../eager/mkl_eager_op_rewrite.cc | 185 ++++++++++++++++ tensorflow/core/graph/mkl_graph_util.h | 8 + tensorflow/core/kernels/mkl_conv_ops.cc | 204 +++++++++++------- tensorflow/core/ops/nn_ops.cc | 31 ++- tensorflow/core/util/mkl_util.h | 93 ++++---- 6 files changed, 404 insertions(+), 131 deletions(-) create mode 100644 tensorflow/core/common_runtime/eager/mkl_eager_op_rewrite.cc diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD index 5d5c93130dc..92f9f14d1c9 100644 --- a/tensorflow/core/common_runtime/eager/BUILD +++ b/tensorflow/core/common_runtime/eager/BUILD @@ -3,6 +3,11 @@ load( "tf_cc_test", "tf_cuda_library", ) +load( + "//third_party/mkl:build_defs.bzl", + "if_mkl", + "mkl_deps", +) package( default_visibility = [ @@ -262,7 +267,14 @@ cc_library( "//tensorflow/core/distributed_runtime/eager:eager_client", "//tensorflow/core/distributed_runtime/eager:remote_execute_node", ], - }), + }) + if_mkl(["mkl_eager_op_rewrite"]), +) + +cc_library( + name = "mkl_eager_op_rewrite", + srcs = if_mkl(["mkl_eager_op_rewrite.cc"]), + copts = if_mkl(["-DINTEL_MKL=1"]), + deps = [":eager_op_rewrite_registry"], ) cc_library( diff --git a/tensorflow/core/common_runtime/eager/mkl_eager_op_rewrite.cc b/tensorflow/core/common_runtime/eager/mkl_eager_op_rewrite.cc new file mode 100644 index 00000000000..2fbc3dfcc84 --- /dev/null +++ b/tensorflow/core/common_runtime/eager/mkl_eager_op_rewrite.cc @@ -0,0 +1,185 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifdef INTEL_MKL +#include "tensorflow/core/common_runtime/eager/eager_op_rewrite_registry.h" +#include "tensorflow/core/graph/mkl_graph_util.h" +#include "tensorflow/core/graph/mkl_layout_pass.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/util/mkl_util.h" +#include "tensorflow/core/util/util.h" + +namespace tensorflow { + +class MklEagerOpRewrite : public EagerOpRewrite { + public: + MklEagerOpRewrite(string name, string file, string line); + typedef struct { + string op_name; + std::function RewriteRule; + std::function*)> + CreateMklOp; + } MklEagerOp; + + private: + std::vector mkl_eager_ops; + + // The entry point to execute the op rewrite. 
+ Status Run(EagerOperation* orig_op, + std::unique_ptr* out_op); + + // Initializes the new op and sets up its inputs and attributes + static Status SetupNewOp(EagerOperation* orig_op, const string mkl_op_name, + std::unique_ptr* new_mkl_op); + + // Creates new MKL op for Conv2D, Conv2DBackpropInput and + // Conv2DBackpropFilter. + static Status CreateMklConv2DOp( + EagerOperation* orig_op, std::unique_ptr* mkl_conv2d_op); + + // Rewrite rule for Conv2D, Conv2DBackpropInput and Conv2DBackpropFilter. + static bool RewriteConv2D(EagerOperation* op); + + // Calls op-specific rewrite function to create new MKL op. + Status RewriteToMklOp(EagerOperation* orig_op, + std::unique_ptr* mkl_op, + const int op_idx); + + // Checks whether we can rewrite the op to MKL one or not. + bool ShouldRewriteOp(EagerOperation* op, int* op_idx); +}; + +const EagerOpRewriteRegistry::Phase kMklEagerOpRewritePhase = + EagerOpRewriteRegistry::PRE_EXECUTION; +REGISTER_REWRITE(kMklEagerOpRewritePhase, MklEagerOpRewrite); + +// Constructor +MklEagerOpRewrite::MklEagerOpRewrite(string name, string file, string line) + : EagerOpRewrite(name, file, line) { + mkl_eager_ops.push_back({"Conv2D", RewriteConv2D, CreateMklConv2DOp}); + mkl_eager_ops.push_back( + {"Conv2DBackpropInput", RewriteConv2D, CreateMklConv2DOp}); + mkl_eager_ops.push_back( + {"Conv2DBackpropFilter", RewriteConv2D, CreateMklConv2DOp}); +} + +Status MklEagerOpRewrite::Run( + EagerOperation* orig_op, + std::unique_ptr* out_op) { + int found_op_idx = -1; + if (ShouldRewriteOp(orig_op, &found_op_idx)) { + TF_CHECK_OK(RewriteToMklOp(orig_op, out_op, found_op_idx)); + } + return Status::OK(); +} + +Status MklEagerOpRewrite::SetupNewOp( + EagerOperation* orig_op, const string mkl_op_name, + std::unique_ptr* new_mkl_op) { + const tensorflow::AttrTypeMap* types; + bool is_function = false; + TF_RETURN_IF_ERROR( + tensorflow::AttrTypeMapForOp(mkl_op_name.c_str(), &types, &is_function)); + EagerContext* ctx = orig_op->EagerContext(); + new_mkl_op->reset(new tensorflow::EagerOperation(ctx, mkl_op_name.c_str(), + is_function, types)); + + int num_inputs = orig_op->Inputs().size(); + // Add all inputs to the new op. + for (int i = 0; i < num_inputs; ++i) { + (*new_mkl_op)->AddInput(orig_op->Inputs()[i]); + } + + // Copy all attributes to the new op. + string name; + const NodeDef& orig_ndef = orig_op->MutableAttrs()->BuildNodeDef(); + + AttrSlice attr_list(orig_ndef); + auto iter = attr_list.begin(); + while (iter != attr_list.end()) { + name = iter->first; + auto attr = iter->second; + (*new_mkl_op)->MutableAttrs()->Set(name, attr); + iter++; + } + (*new_mkl_op) + ->MutableAttrs() + ->Set("_kernel", mkl_op_registry::kMklNameChangeOpLabel); + + if (orig_op->Device() != nullptr) { + (*new_mkl_op)->SetDevice(orig_op->Device()); + } else { + const char* device_name = + DeviceNameUtils::ParsedNameToString(orig_op->GetDeviceName()).c_str(); + (*new_mkl_op)->SetDeviceName(device_name); + } + return Status::OK(); +} + +Status MklEagerOpRewrite::CreateMklConv2DOp( + EagerOperation* orig_op, std::unique_ptr* mkl_conv2d_op) { + const string mkl_op_name = + mkl_op_registry::GetMklEagerOpName(orig_op->Name()); + TF_CHECK_OK(SetupNewOp(orig_op, mkl_op_name, mkl_conv2d_op)); + return Status::OK(); +} + +bool MklEagerOpRewrite::ShouldRewriteOp(EagerOperation* op, int* op_idx) { + // Don't rewrite the op if MKL use is disabled at runtime. 
+ if (DisableMKL()) { + return false; + } + DataType T; + if (op->Attrs().Get("T", &T) != Status::OK()) { + return false; + } + // Check if we have registered MKL kernel for this op. + if (!mkl_op_registry::IsMklNameChangeOp( + mkl_op_registry::GetMklEagerOpName(op->Name()), T) && + !mkl_op_registry::IsMklNameChangeOp( + mkl_op_registry::GetMklOpName(op->Name()), T)) { + return false; + } + + bool result = false; + *op_idx = -1; + // Find and call the op's rewrite rule that determines whether we need to + // rewrite this op or not. + for (auto it = mkl_eager_ops.begin(); it != mkl_eager_ops.end(); ++it) { + if (it->op_name.compare(op->Name()) == 0 && it->RewriteRule(op)) { + *op_idx = it - mkl_eager_ops.begin(); + result = true; + break; + } + } + return result; +} + +Status MklEagerOpRewrite::RewriteToMklOp( + EagerOperation* orig_op, std::unique_ptr* mkl_op, + const int op_idx) { + mkl_eager_ops[op_idx].CreateMklOp(orig_op, mkl_op); + return Status::OK(); +} + +bool MklEagerOpRewrite::RewriteConv2D(EagerOperation* op) { + const NodeDef& ndef = op->MutableAttrs()->BuildNodeDef(); + string padding; + TF_CHECK_OK(GetNodeAttr(ndef, "padding", &padding)); + // Right now MKL Conv2D does not support explicit padding. + return padding == "EXPLICIT" ? false : true; +} + +} // namespace tensorflow +#endif // INTEL_MKL diff --git a/tensorflow/core/graph/mkl_graph_util.h b/tensorflow/core/graph/mkl_graph_util.h index c204dd0ffcf..56264694eba 100644 --- a/tensorflow/core/graph/mkl_graph_util.h +++ b/tensorflow/core/graph/mkl_graph_util.h @@ -104,12 +104,20 @@ static const char* kMklQuantizedOpLabelPattern = "label='QuantizedMklOp'"; // Prefix that we add to Tensorflow op name to construct Mkl op name. static const char* const kMklOpPrefix = "_Mkl"; +static const char* const kMklEagerOpPrefix = "_MklEager"; // Get the name of Mkl op from original TensorFlow op // We prefix 'Mkl' to the original op to get Mkl op. inline string GetMklOpName(const string& name) { return string(kMklOpPrefix) + name; } + +// Get the name of Mkl Eager op from original TensorFlow op +// We prefix 'MklEager' to the original op to get Mkl Eager op. +inline string GetMklEagerOpName(const string& name) { + return string(kMklEagerOpPrefix) + name; +} + // Check whether opname with type T is registered as MKL operator // that can accept input tensors in MKL layout. // diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc index 14344da0560..35ef59b0b1f 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_ops.cc @@ -24,8 +24,8 @@ limitations under the License. 
#include #include -#include "mkldnn.hpp" #include "absl/strings/str_join.h" +#include "mkldnn.hpp" #include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" @@ -401,7 +401,8 @@ typedef Eigen::ThreadPoolDevice CPUDevice; // Base class for convolution forward operations template + bool bias_enabled, bool pad_enabled, bool is_depthwise, + bool eager_mode> class MklConvOp : public OpKernel { public: ~MklConvOp() {} @@ -428,8 +429,10 @@ class MklConvOp : public OpKernel { "strides in the batch and depth dimensions.")); OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); is_filter_const_ = false; - OP_REQUIRES_OK(context, - context->GetAttr("is_filter_const", &is_filter_const_)); + if (context->HasAttr("is_filter_const")) { + OP_REQUIRES_OK(context, + context->GetAttr("is_filter_const", &is_filter_const_)); + } if (strides_.size() == 4) { OP_REQUIRES(context, dilations_.size() == 4, @@ -450,17 +453,15 @@ class MklConvOp : public OpKernel { OP_REQUIRES(context, dilations_.size() == 5, errors::InvalidArgument("Dilation rates field must " "specify 5 dimensions")); - OP_REQUIRES(context, - (GetTensorDim(dilations_, data_format_, 'N') == 1 && - GetTensorDim(dilations_, data_format_, 'C') == 1), + OP_REQUIRES(context, (GetTensorDim(dilations_, data_format_, 'N') == 1 && + GetTensorDim(dilations_, data_format_, 'C') == 1), errors::InvalidArgument( "Current implementation does not yet support " "dilations rates in the batch and depth dimensions.")); OP_REQUIRES( - context, - (GetTensorDim(dilations_, data_format_, '0') > 0 && - GetTensorDim(dilations_, data_format_, '1') > 0 && - GetTensorDim(dilations_, data_format_, '2') > 0), + context, (GetTensorDim(dilations_, data_format_, '0') > 0 && + GetTensorDim(dilations_, data_format_, '1') > 0 && + GetTensorDim(dilations_, data_format_, '2') > 0), errors::InvalidArgument("Dilated rates should be larger than 0.")); } } @@ -472,8 +473,9 @@ class MklConvOp : public OpKernel { const Tensor& filter_tensor = MklGetInput(context, kInputIndex_Filter); MklDnnShape src_mkl_shape, filter_mkl_shape; - GetMklShape(context, kInputIndex_Src, &src_mkl_shape); - GetMklShape(context, kInputIndex_Filter, &filter_mkl_shape); + GetMklShape(context, kInputIndex_Src, &src_mkl_shape, eager_mode); + GetMklShape(context, kInputIndex_Filter, &filter_mkl_shape, eager_mode); + OP_REQUIRES(context, filter_mkl_shape.IsMklTensor() == false, errors::InvalidArgument("Filter should not be in " "Mkl Layout")); @@ -503,8 +505,9 @@ class MklConvOp : public OpKernel { // Get shapes of input tensors in MKL-DNN order MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_, dilations_); - auto src_tf_shape = GetTfShape(context, kInputIndex_Src); - auto filter_tf_shape = GetTfShape(context, kInputIndex_Filter); + auto src_tf_shape = GetTfShape(context, kInputIndex_Src, eager_mode); + auto filter_tf_shape = + GetTfShape(context, kInputIndex_Filter, eager_mode); conv_utl.GetConvFwdSizesInMklOrder( src_tf_shape, filter_tf_shape, &src_dims, &filter_dims, &strides, &dilations, &dst_dims_tf_order, &dst_dims_mkl_order, &padding_left, @@ -517,15 +520,17 @@ class MklConvOp : public OpKernel { // Corner cases: output with 0 elements and 0 batch size. 
Tensor* dst_tensor = nullptr; + Tensor tmp_tensor; bool emit_filter_output = (typeid(Tinput) == typeid(Tfilter) && typeid(Tinput) == typeid(Toutput) && (typeid(Tinput) == typeid(float) || - typeid(Tinput) == typeid(bfloat16))); + typeid(Tinput) == typeid(bfloat16))) && + !eager_mode; if (dst_tf_shape.num_elements() == 0 || dst_dims_tf_order[0] == 0) { MklDnnShape dst_mkl_shape; dst_mkl_shape.SetMklTensor(false); AllocateOutputSetMklShape(context, kOutputIndex_Dst, &dst_tensor, - src_tf_shape, dst_mkl_shape); + src_tf_shape, dst_mkl_shape, eager_mode); // MklConv2D/3D also outputs converted filter as 2nd output. filter_mkl_shape.SetMklTensor(false); @@ -627,9 +632,10 @@ class MklConvOp : public OpKernel { convFwdDims, do_not_cache); // Allocate output tensors `output_tensor` and `filter_out_tensor` + MklDnnShape output_mkl_shape; std::shared_ptr conv_fwd_pd = conv_fwd->GetPrimitiveDesc(); AllocateOutputTensor(context, *conv_fwd_pd, dst_dims_mkl_order, tf_fmt, - &dst_tensor); + &output_mkl_shape, &dst_tensor, &tmp_tensor); Tensor* filter_out_tensor = nullptr; if (emit_filter_output) { @@ -695,7 +701,28 @@ class MklConvOp : public OpKernel { this->GetBiasHandle(context, conv_fwd_pd, bias_tensor); conv_fwd->Execute(src_data, filter_data, bias_data, dst_data); } else { - conv_fwd->Execute(src_data, filter_data, dst_data); + if (!eager_mode) { + conv_fwd->Execute(src_data, filter_data, dst_data); + } else { + // In eager mode we first write the output to temporary + // buffer in MKL format. Then we convert the data to TF format. + Ttemp_output* tmp_data = reinterpret_cast( + tmp_tensor.flat().data()); + conv_fwd->Execute(src_data, filter_data, tmp_data); + + // Now we need to convert the output to TF format. + auto output_tf_md = output_mkl_shape.GetTfLayout(); + auto output_tf_pd = memory::primitive_desc(output_tf_md, cpu_engine_); + auto dst_pd = (*conv_fwd_pd).dst_primitive_desc(); + mkldnn::reorder::primitive_desc reorder_pd = + mkldnn::reorder::primitive_desc(dst_pd, output_tf_pd); + std::vector net; + memory* tmp_data_mem = new memory(dst_pd, tmp_data); + memory* dst_data_mem = new memory(output_tf_pd, dst_data); + net.push_back( + mkldnn::reorder(reorder_pd, *tmp_data_mem, *dst_data_mem)); + stream(stream::kind::eager).submit(net).wait(); + } } // Delete primitive since it is not cached. @@ -809,7 +836,9 @@ class MklConvOp : public OpKernel { const ConvFwdPd& conv_prim_desc, const memory::dims& output_dims_mkl_order, memory::format output_tf_format, - Tensor** output_tensor) { + MklDnnShape* output_mkl_shape, + Tensor** output_tensor, + Tensor* tmp_tensor) { CHECK_NOTNULL(output_tensor); auto dst_pd = conv_prim_desc.dst_primitive_desc(); @@ -820,33 +849,36 @@ class MklConvOp : public OpKernel { dst_pd = memory::primitive_desc(dst_md, cpu_engine_); } // Allocate shape of Mkl tensor. - MklDnnShape output_mkl_shape; - output_mkl_shape.SetMklTensor(true); - output_mkl_shape.SetMklLayout(&dst_pd); - output_mkl_shape.SetElemType(MklDnnType()); - output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), - output_dims_mkl_order, output_tf_format); + output_mkl_shape->SetMklTensor(true); + output_mkl_shape->SetMklLayout(&dst_pd); + output_mkl_shape->SetElemType(MklDnnType()); + output_mkl_shape->SetTfLayout(output_dims_mkl_order.size(), + output_dims_mkl_order, output_tf_format); // Allocate shape of TF tensor. 
TensorShape output_tf_shape; output_tf_shape.AddDim((dst_pd.get_size() / sizeof(Toutput))); + if (eager_mode) { + AllocTmpBuffer(context, tmp_tensor, output_tf_shape); + output_tf_shape = output_mkl_shape->GetTfShape(); + } AllocateOutputSetMklShape(context, kOutputIndex_Dst, output_tensor, - output_tf_shape, output_mkl_shape); + output_tf_shape, *output_mkl_shape, eager_mode); if (fuse_add_) { const Tensor& add_tensor = MklGetInput(context, kInputIndex_Add); MklDnnShape add_mkl_shape; GetMklShape(context, kInputIndex_Add, &add_mkl_shape); // Check if need reorder - if (add_mkl_shape == output_mkl_shape) { + if (add_mkl_shape == *output_mkl_shape) { CHECK((*output_tensor)->CopyFrom(add_tensor, output_tf_shape)); } else { auto add_md = add_mkl_shape.IsMklTensor() ? add_mkl_shape.GetMklLayout() : memory::desc(output_dims_mkl_order, MklDnnType(), - output_mkl_shape.GetTfDataFormat()); + output_mkl_shape->GetTfDataFormat()); auto add_pd = memory::primitive_desc(add_md, this->cpu_engine_); void* add_buf = static_cast( const_cast(add_tensor.flat().data())); @@ -1047,11 +1079,11 @@ template class MklFusedConvOp : public MklConvOp { + Tpadding, false, false, false, false> { public: explicit MklFusedConvOp(OpKernelConstruction* context) : MklConvOp(context) { + Tpadding, false, false, false, false>(context) { // Since we came here through the registration of _MklFusedConv2D, get // all information from 'fused_ops' and 'num_args' std::vector fused_ops; @@ -1143,7 +1175,7 @@ template class MklQuantizedConv2DOp : public MklConvOp { + int32, bias_enabled, false, is_depthwise, false> { public: virtual ~MklQuantizedConv2DOp() { if (this->input_bias_ != nullptr) { @@ -1159,7 +1191,7 @@ class MklQuantizedConv2DOp explicit MklQuantizedConv2DOp(OpKernelConstruction* context) : MklConvOp(context) { + bias_enabled, false, is_depthwise, false>(context) { bool is_filter_const; OP_REQUIRES_OK(context, context->GetAttr("is_filter_const", &is_filter_const)); @@ -1170,7 +1202,7 @@ class MklQuantizedConv2DOp void Compute(OpKernelContext* context) override { // Compute int32 output tensor MklConvOp::Compute(context); + bias_enabled, false, is_depthwise, false>::Compute(context); // Compute additional outputs: min/max scalars. int bias_index_offset; @@ -1232,8 +1264,8 @@ class MklQuantizedConv2DOp void ExtendConvFwdParams(OpKernelContext* context, MklConvFwdParams& params) override { MklConvOp::ExtendConvFwdParams(context, - params); + bias_enabled, false, is_depthwise, + false>::ExtendConvFwdParams(context, params); // When the output type is quint8, the output data id requantized // into quint8. A post_op "output_scale" is added to do the conversion. 
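[Editor's note] The eager-mode output path added in the hunks above follows one pattern: because eager execution carries no MKL layout-metadata tensors, the convolution primitive first writes into a temporary buffer in the MKL blocked layout, and that buffer is then reordered into the plain TensorFlow layout of the real output. A minimal sketch of that conversion step, using the same mkldnn 0.x API the patch itself uses; the helper name and its arguments are illustrative only and not part of the change:

    // Convert a buffer from an MKL blocked layout (src_pd) into the plain
    // TensorFlow layout (dst_pd) with an mkldnn reorder primitive.
    static void ReorderMklToTf(const mkldnn::memory::primitive_desc& src_pd,
                               const mkldnn::memory::primitive_desc& dst_pd,
                               void* src_buf, void* dst_buf) {
      mkldnn::memory src_mem(src_pd, src_buf);   // MKL-layout temporary
      mkldnn::memory dst_mem(dst_pd, dst_buf);   // TF-layout output tensor
      mkldnn::reorder::primitive_desc reorder_pd(src_pd, dst_pd);
      std::vector<mkldnn::primitive> net;
      net.push_back(mkldnn::reorder(reorder_pd, src_mem, dst_mem));
      // Same eager-stream execution style used in the patch.
      mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait();
    }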
@@ -1432,7 +1464,9 @@ class MklQuantizedConv2DSumReluOp const ConvFwdPd& conv_prim_desc, const memory::dims& output_dims_mkl_order, memory::format output_tf_format, - Tensor** output_tensor) override { + MklDnnShape* output_mkl_shape, + Tensor** output_tensor, + Tensor* tmp_tensor) override { int summand_idx = context->num_inputs() / 2 - 1; if (std::is_same::value) { summand_idx -= 2; @@ -1459,12 +1493,12 @@ class MklQuantizedConv2DSumReluOp *output_tensor = const_cast(&summand); return; } - MklConvOp::AllocateOutputTensor(context, conv_prim_desc, output_dims_mkl_order, - output_tf_format, output_tensor); + output_tf_format, output_mkl_shape, + output_tensor, tmp_tensor); const Tensor& summand = MklGetInput(context, summand_idx); if (summand.dtype() != DT_FLOAT) TF_CHECK_OK(Status(error::Code::FAILED_PRECONDITION, @@ -1870,46 +1904,52 @@ REGISTER_KERNEL_BUILDER( MklQuantizedConv2DReluOp); // Register 2D operations -#define REGISTER_MKL_CPU_2D(T) \ - REGISTER_KERNEL_BUILDER( \ - Name("_MklConv2D") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ - MklConvOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("_MklConv2DWithBias") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ - MklConvOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("__MklDummyConv2DWithBias") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ - MklDummyOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("_MklPadWithConv2D") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .TypeConstraint("Tpaddings") \ - .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ - MklConvOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("_MklPadWithConv2D") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .TypeConstraint("Tpaddings") \ - .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ - MklConvOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("__MklDummyPadWithConv2D") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .TypeConstraint("Tpaddings") \ - .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ - MklDummyOp); +#define REGISTER_MKL_CPU_2D(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklConv2D") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ + MklConvOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklConv2DWithBias") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ + MklConvOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("__MklDummyConv2DWithBias") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ + MklDummyOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklPadWithConv2D") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tpaddings") \ + .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ + MklConvOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklPadWithConv2D") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tpaddings") \ + .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ + MklConvOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("__MklDummyPadWithConv2D") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tpaddings") \ + .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ + MklDummyOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklEagerConv2D") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklNameChangeOpLabel), \ + MklConvOp); 
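[Editor's note] The new _MklEagerConv2D registration above closes the loop with the eager rewrite introduced earlier in this patch: the rewrite renames the op with the _MklEager prefix and tags it with the name-change kernel label, so kernel lookup resolves to this MklConvOp instantiation with eager_mode enabled. Condensed from the rewrite code in mkl_eager_op_rewrite.cc above (illustrative restatement, not additional patch content):

    // Inside CreateMklConv2DOp / SetupNewOp:
    string mkl_name = mkl_op_registry::GetMklEagerOpName("Conv2D");  // "_MklEagerConv2D"
    (*new_mkl_op)->MutableAttrs()->Set("_kernel",
                                       mkl_op_registry::kMklNameChangeOpLabel);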
TF_CALL_float(REGISTER_MKL_CPU_2D); TF_CALL_bfloat16(REGISTER_MKL_CPU_2D); @@ -1920,7 +1960,7 @@ TF_CALL_bfloat16(REGISTER_MKL_CPU_2D); .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ - MklConvOp); + MklConvOp); TF_CALL_float(REGISTER_MKL_CPU_2D_DEPTHWISE); TF_CALL_bfloat16(REGISTER_MKL_CPU_2D_DEPTHWISE); @@ -1966,7 +2006,7 @@ TF_CALL_bfloat16(REGISTER_MKL_CPU_2D_FUSED); .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ - MklConvOp); + MklConvOp); TF_CALL_float(REGISTER_MKL_CPU_3D); TF_CALL_bfloat16(REGISTER_MKL_CPU_3D); diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc index a55dde64e1b..3354a403125 100644 --- a/tensorflow/core/ops/nn_ops.cc +++ b/tensorflow/core/ops/nn_ops.cc @@ -1281,9 +1281,9 @@ Status TopKShapeFn(InferenceContext* c) { DimensionHandle last_dim = c->Dim(input, -1); if (c->ValueKnown(last_dim) && c->ValueKnown(k_dim) && c->Value(last_dim) < c->Value(k_dim)) { - return errors::InvalidArgument( - "input must have last dimension >= k = ", c->Value(k_dim), " but is ", - c->Value(last_dim)); + return errors::InvalidArgument("input must have last dimension >= k = ", + c->Value(k_dim), " but is ", + c->Value(last_dim)); } // Replace last_dim with k_dim. @@ -1337,9 +1337,9 @@ REGISTER_OP("NthElement") DimensionHandle last_dim = c->Dim(input, -1); if (c->ValueKnown(last_dim) && c->ValueKnown(n_dim) && c->Value(last_dim) <= c->Value(n_dim)) { - return errors::InvalidArgument( - "Input must have last dimension > n = ", c->Value(n_dim), - " but is ", c->Value(last_dim)); + return errors::InvalidArgument("Input must have last dimension > n = ", + c->Value(n_dim), " but is ", + c->Value(last_dim)); } // Reduce last_dim for output tensor @@ -1652,6 +1652,25 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is expected to invoke these operators. )doc"); +REGISTER_OP("_MklEagerConv2D") + .Input("input: T") + .Input("filter: T") + .Output("output: T") + .Attr("T: {bfloat16, float}") + .Attr("strides: list(int)") + .Attr("use_cudnn_on_gpu: bool = true") + .Attr(GetPaddingAttrStringWithExplicit()) + .Attr(GetExplicitPaddingsAttrString()) + .Attr(GetConvnetDataFormatAttrString()) + .Attr("dilations: list(int) = [1, 1, 1, 1]") + .SetShapeFn(shape_inference::Conv2DShapeWithExplicitPadding) + .Doc(R"doc( + MKL version of Conv2D operator for Eager mode. Uses MKL DNN APIs to perform 2D convolution. + + NOTE Do not invoke this operator directly in Python. Eager Op rewrite is + expected to invoke these operators. + )doc"); + REGISTER_OP("__MklDummyConv2DWithBias") .Input("input: T") .Input("filter: T") diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index d94c4f23ef9..6deb785238c 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -295,32 +295,32 @@ class MklShape { CHECK_EQ(dnnDelete_F32(convert), E_SUCCESS); } - // The following methods are used for serializing and de-serializing the - // contents of the mklshape object. - // The data is serialized in this order - // isMklTensor_ - // dimension_ - // sizes_ - // strides_ - // mklLayout_ - // tfLayout_ - // tf_to_mkl_dim_map_ +// The following methods are used for serializing and de-serializing the +// contents of the mklshape object. 
+// The data is serialized in this order +// isMklTensor_ +// dimension_ +// sizes_ +// strides_ +// mklLayout_ +// tfLayout_ +// tf_to_mkl_dim_map_ #define SIZE_OF_MKL_DNN_BUF \ (dnnLayoutSerializationBufferSize_F32()) // Size of buffer needed to // serialize dnn_layout pointer - // Size of buffer to hold the serialized object, the size is computed as - // follows sizeof(isMklTensor_) + sizeof(dimension_) + sizeof(sizes_) + - // sizeof(strides_) - // + sizeof(mklLayout_ buffer) + sizeof(tfLayout_ buffer) - // + sizeof(tf_to_mkl_dim_map_) +// Size of buffer to hold the serialized object, the size is computed as +// follows sizeof(isMklTensor_) + sizeof(dimension_) + sizeof(sizes_) + +// sizeof(strides_) +// + sizeof(mklLayout_ buffer) + sizeof(tfLayout_ buffer) +// + sizeof(tf_to_mkl_dim_map_) #define SIZE_OF_MKL_SERIAL_DATA(dims) \ (2 * sizeof(size_t) + 3 * dims * sizeof(size_t) + 2 * SIZE_OF_MKL_DNN_BUF) - // First we need to define some macro for offsets into the serial buffer where - // different elements of Mklshape is written/read from +// First we need to define some macro for offsets into the serial buffer where +// different elements of Mklshape is written/read from #define IS_MKL_TENSOR_OFFSET 0 // Location from start of buffer where isMklTensor_ is serialized @@ -880,9 +880,9 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor, CHECK(output_tensor.CopyFrom(mkl_tensor, output_shape)); } } catch (mkldnn::error& e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); + string error_msg = "Status: " + std::to_string(e.status) + ", message: " + + string(e.message) + ", in file " + string(__FILE__) + + ":" + std::to_string(__LINE__); LOG(FATAL) << "Operation received an exception: " << error_msg; } return output_tensor; @@ -902,15 +902,20 @@ inline void GetMklShape(OpKernelContext* ctext, int n, MklShape* mklshape) { sizeof(uint8)); } #else -inline void GetMklShape(OpKernelContext* ctext, int n, MklDnnShape* mklshape) { - mklshape->DeSerializeMklDnnShape( - ctext->input(GetTensorMetaDataIndex(n, ctext->num_inputs())) - .flat() - .data(), - ctext->input(GetTensorMetaDataIndex(n, ctext->num_inputs())) - .flat() - .size() * - sizeof(uint8)); +inline void GetMklShape(OpKernelContext* ctext, int n, MklDnnShape* mklshape, + bool eager_mode = false) { + if (!eager_mode) { + mklshape->DeSerializeMklDnnShape( + ctext->input(GetTensorMetaDataIndex(n, ctext->num_inputs())) + .flat() + .data(), + ctext->input(GetTensorMetaDataIndex(n, ctext->num_inputs())) + .flat() + .size() * + sizeof(uint8)); + } else { + mklshape->SetMklTensor(false); + } } #endif @@ -959,14 +964,15 @@ inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name, /// Get shape of input tensor pointed by 'input_idx' in TensorShape format. /// If the input tensor is in MKL layout, then obtains TensorShape from /// MklShape. -inline TensorShape GetTfShape(OpKernelContext* context, size_t input_idx) { +inline TensorShape GetTfShape(OpKernelContext* context, size_t input_idx, + bool eager_mode = false) { // Sanity check. 
CHECK_NOTNULL(context); CHECK_LT(input_idx, context->num_inputs()); MklDnnShape input_mkl_shape; - GetMklShape(context, input_idx, &input_mkl_shape); - if (input_mkl_shape.IsMklTensor()) { + GetMklShape(context, input_idx, &input_mkl_shape, eager_mode); + if (input_mkl_shape.IsMklTensor() && !eager_mode) { return input_mkl_shape.GetTfShape(); } else { const Tensor& t = MklGetInput(context, input_idx); @@ -1035,19 +1041,22 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n, inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n, Tensor** output, const TensorShape& tf_shape, - const MklDnnShape& mkl_shape) { - Tensor* second_tensor = nullptr; - TensorShape second_shape; - second_shape.AddDim(mkl_shape.GetSerializeBufferSize()); + const MklDnnShape& mkl_shape, + bool eager_mode = false) { OP_REQUIRES_OK( ctext, ctext->allocate_output(GetTensorDataIndex(n, ctext->num_outputs()), tf_shape, output)); - OP_REQUIRES_OK(ctext, ctext->allocate_output( - GetTensorMetaDataIndex(n, ctext->num_outputs()), - second_shape, &second_tensor)); - mkl_shape.SerializeMklDnnShape( - second_tensor->flat().data(), - second_tensor->flat().size() * sizeof(uint8)); + if (!eager_mode) { + Tensor* second_tensor = nullptr; + TensorShape second_shape; + second_shape.AddDim(mkl_shape.GetSerializeBufferSize()); + OP_REQUIRES_OK(ctext, ctext->allocate_output( + GetTensorMetaDataIndex(n, ctext->num_outputs()), + second_shape, &second_tensor)); + mkl_shape.SerializeMklDnnShape( + second_tensor->flat().data(), + second_tensor->flat().size() * sizeof(uint8)); + } } #endif From 352fe1c79ae6defa60b5e63f09ac9b8517636b2c Mon Sep 17 00:00:00 2001 From: Mahmoud Abuzaina Date: Wed, 3 Jul 2019 23:49:41 -0700 Subject: [PATCH 0063/3053] Enabling MKL Conv2D BWD in eager mode --- .../core/kernels/mkl_conv_grad_filter_ops.cc | 102 ++++++++++-------- .../core/kernels/mkl_conv_grad_input_ops.cc | 95 ++++++++++------ tensorflow/core/ops/nn_ops.cc | 66 ++++++++++-- 3 files changed, 177 insertions(+), 86 deletions(-) diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc index aa4254de20b..24dd230a7e1 100644 --- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc @@ -357,7 +357,8 @@ class MklConvBwdFilterPrimitiveFactory : public MklPrimitiveFactory { } }; -template +template class MklConvCustomBackpropFilterOp : public MklConvBackpropCommonOp { public: @@ -382,9 +383,9 @@ class MklConvCustomBackpropFilterOp const Tensor& diff_dst_tensor = MklGetInput(context, kOutbpropIdx); MklDnnShape src_mkl_shape, filter_mkl_shape, diff_dst_mkl_shape; - GetMklShape(context, kInputIdx, &src_mkl_shape); - GetMklShape(context, kFilterIdx, &filter_mkl_shape); - GetMklShape(context, kOutbpropIdx, &diff_dst_mkl_shape); + GetMklShape(context, kInputIdx, &src_mkl_shape, eager_mode); + GetMklShape(context, kFilterIdx, &filter_mkl_shape, eager_mode); + GetMklShape(context, kOutbpropIdx, &diff_dst_mkl_shape, eager_mode); // Allow operator-specific sanity checking of shapes. ValidateMklShapes(src_mkl_shape, filter_mkl_shape, diff_dst_mkl_shape); @@ -395,7 +396,8 @@ class MklConvCustomBackpropFilterOp // allow this class to handle this case. 
TensorShape src_tf_shape = MakeInputTfShape(context, src_tensor); TensorShape filter_tf_shape = MakeFilterTfShape(context, filter_tensor); - TensorShape diff_dst_tf_shape = GetTfShape(context, kOutbpropIdx); + TensorShape diff_dst_tf_shape = + GetTfShape(context, kOutbpropIdx, eager_mode); // Corner cases: output with 0 elements and 0 batch size. Tensor* diff_filter_tensor = nullptr; @@ -408,7 +410,8 @@ class MklConvCustomBackpropFilterOp GetOutputTfShape(src_tf_shape, filter_tf_shape, diff_dst_tf_shape); const int kOutputIdx = 0; AllocateOutputSetMklShape(context, kOutputIdx, &diff_filter_tensor, - diff_filter_tf_shape, diff_filter_mkl_shape); + diff_filter_tf_shape, diff_filter_mkl_shape, + eager_mode); CHECK_NOTNULL(diff_filter_tensor); // if output tensor has more than 0 elements, we need to 0 them out. @@ -493,8 +496,8 @@ class MklConvCustomBackpropFilterOp bwd_output_dims[MklDnnDims::Dim_I], bwd_output_dims[MklDnnDims::Dim_O]}); AllocateOutputSetMklShape(context, 0, &diff_filter_tensor, - diff_filter_tf_shape, - diff_filter_mkl_shape); + diff_filter_tf_shape, diff_filter_mkl_shape, + eager_mode); } else { // Depthwise Conv2d: bwd_output_dims is GOIHW format // | TensorFlow | MKLDNN @@ -592,9 +595,9 @@ class MklConvCustomBackpropFilterOp // delete primitive since it is not cached. if (do_not_cache) delete conv_bwd_filter; } catch (mkldnn::error& e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); + string error_msg = "Status: " + std::to_string(e.status) + ", message: " + + string(e.message) + ", in file " + string(__FILE__) + + ":" + std::to_string(__LINE__); OP_REQUIRES_OK( context, errors::Aborted("Operation received an exception:", error_msg)); @@ -620,7 +623,7 @@ class MklConvCustomBackpropFilterOp TensorShape MakeInputTfShape(OpKernelContext* context, const Tensor& input_tensor) { size_t input_idx = 0; - return GetTfShape(context, input_idx); + return GetTfShape(context, input_idx, eager_mode); } // Get TensorFlow shape of filter tensor. @@ -654,10 +657,9 @@ class MklConvCustomBackpropFilterOp // Output layout is Tensorflow's filter layout // Conv2D: HWIO; Conv3D: DHWIO; Depthwise Conv: HWIGO memory::format GetOutputFormat(const memory::format data_format) { - return is_depthwise - ? memory::format::hwigo - : ((this->strides_.size() == 4) ? memory::format::hwio - : memory::format::dhwio); + return is_depthwise ? memory::format::hwigo : ((this->strides_.size() == 4) + ? memory::format::hwio + : memory::format::dhwio); } // Allocate output tensor. 
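[Editor's note] The backward-filter Compute above relies on the eager_mode overloads of the shape helpers changed in the previous patch: with eager_mode set, GetMklShape reports a non-MKL tensor and GetTfShape simply returns the plain shape of the input, so the kernel can consume ordinary TensorFlow tensors without the metadata tensors used in graph mode. A simplified sketch of that fall-through, not verbatim from mkl_util.h:

    inline TensorShape GetTfShapeSketch(OpKernelContext* ctx, size_t idx,
                                        bool eager_mode) {
      MklDnnShape mkl_shape;
      GetMklShape(ctx, idx, &mkl_shape, eager_mode);  // eager: SetMklTensor(false)
      if (mkl_shape.IsMklTensor() && !eager_mode)
        return mkl_shape.GetTfShape();       // graph mode with MKL layout metadata
      return MklGetInput(ctx, idx).shape();  // eager mode: plain TF tensor shape
    }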
@@ -699,37 +701,43 @@ class MklConvCustomBackpropFilterOp } }; -#define REGISTER_MKL_FILTER_KERNELS(T) \ - REGISTER_KERNEL_BUILDER( \ - Name("_MklConv2DBackpropFilter") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ - MklConvCustomBackpropFilterOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("_MklConv2DBackpropFilterWithBias") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ - MklConvCustomBackpropFilterOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("_MklDepthwiseConv2dNativeBackpropFilter") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ - MklConvCustomBackpropFilterOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("__MklDummyConv2DBackpropFilterWithBias") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ - MklDummyOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("_MklConv3DBackpropFilterV2") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ - MklConvCustomBackpropFilterOp); +#define REGISTER_MKL_FILTER_KERNELS(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklConv2DBackpropFilter") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ + MklConvCustomBackpropFilterOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklEagerConv2DBackpropFilter") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklNameChangeOpLabel), \ + MklConvCustomBackpropFilterOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklConv2DBackpropFilterWithBias") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ + MklConvCustomBackpropFilterOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklDepthwiseConv2dNativeBackpropFilter") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ + MklConvCustomBackpropFilterOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("__MklDummyConv2DBackpropFilterWithBias") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ + MklDummyOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklConv3DBackpropFilterV2") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ + MklConvCustomBackpropFilterOp); TF_CALL_float(REGISTER_MKL_FILTER_KERNELS); TF_CALL_bfloat16(REGISTER_MKL_FILTER_KERNELS); diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc index e23e099916a..bed7a752bae 100644 --- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc @@ -295,7 +295,7 @@ class MklConvBwdInputPrimitiveFactory : public MklPrimitiveFactory { } }; -template +template class MklConvCustomBackpropInputOp : public MklConvBackpropCommonOp { public: @@ -319,9 +319,9 @@ class MklConvCustomBackpropInputOp const Tensor& diff_dst_tensor = MklGetInput(context, kOutbpropIdx); MklDnnShape src_mkl_shape, filter_mkl_shape, diff_dst_mkl_shape; - GetMklShape(context, kInputIdx, &src_mkl_shape); - GetMklShape(context, kFilterIdx, &filter_mkl_shape); - GetMklShape(context, kOutbpropIdx, &diff_dst_mkl_shape); + GetMklShape(context, kInputIdx, &src_mkl_shape, eager_mode); + GetMklShape(context, kFilterIdx, &filter_mkl_shape, eager_mode); + GetMklShape(context, kOutbpropIdx, 
&diff_dst_mkl_shape, eager_mode); // Allow operator-specific sanity checking of shapes. ValidateMklShapes(src_mkl_shape, filter_mkl_shape, diff_dst_mkl_shape); @@ -332,10 +332,12 @@ class MklConvCustomBackpropInputOp // allow this class to handle this case. TensorShape src_tf_shape = MakeInputTfShape(context, src_tensor); TensorShape filter_tf_shape = MakeFilterTfShape(context, filter_tensor); - TensorShape diff_dst_tf_shape = GetTfShape(context, kOutbpropIdx); + TensorShape diff_dst_tf_shape = + GetTfShape(context, kOutbpropIdx, eager_mode); // Corner cases: output with 0 elements and 0 batch size. Tensor* diff_src_tensor = nullptr; + Tensor tmp_tensor; if (src_tf_shape.num_elements() == 0 || filter_tf_shape.num_elements() == 0 || diff_dst_tf_shape.num_elements() == 0) { @@ -345,7 +347,8 @@ class MklConvCustomBackpropInputOp GetOutputTfShape(src_tf_shape, filter_tf_shape, diff_dst_tf_shape); const int kOutputIdx = 0; AllocateOutputSetMklShape(context, kOutputIdx, &diff_src_tensor, - diff_src_tf_shape, diff_src_mkl_shape); + diff_src_tf_shape, diff_src_mkl_shape, + eager_mode); CHECK_NOTNULL(diff_src_tensor); // if output tensor has more than 0 elements, we need to 0 them out. @@ -429,9 +432,12 @@ class MklConvCustomBackpropInputOp bwd_diff_src_dims, bwd_diff_src_format); TensorShape diff_src_tf_shape; diff_src_tf_shape.AddDim(diff_src_pd.get_size() / sizeof(T)); + if (eager_mode) { + AllocTmpBuffer(context, &tmp_tensor, diff_src_tf_shape); + diff_src_tf_shape = diff_src_mkl_shape.GetTfShape(); + } AllocateOutputSetMklShape(context, 0, &diff_src_tensor, diff_src_tf_shape, - diff_src_mkl_shape); - + diff_src_mkl_shape, eager_mode); T* diff_src_data = static_cast(const_cast(diff_src_tensor->flat().data())); @@ -458,16 +464,34 @@ class MklConvCustomBackpropInputOp } // execute convolution input bwd - conv_bwd_input->Execute(diff_src_data, filter_data, diff_dst_data); + if (!eager_mode) { + conv_bwd_input->Execute(diff_src_data, filter_data, diff_dst_data); + } else { + // In eager mode we first write the output to temporary + // buffer in MKL format. Then we convert the data to TF format. + T* tmp_data = + static_cast(const_cast(tmp_tensor.flat().data())); + conv_bwd_input->Execute(tmp_data, filter_data, diff_dst_data); + auto output_tf_md = diff_src_mkl_shape.GetTfLayout(); + auto output_tf_pd = memory::primitive_desc(output_tf_md, cpu_engine); + mkldnn::reorder::primitive_desc reorder_pd = + mkldnn::reorder::primitive_desc(diff_src_pd, output_tf_pd); + std::vector net; + memory* tmp_data_mem = new memory(diff_src_pd, tmp_data); + memory* dst_data_mem = new memory(output_tf_pd, diff_src_data); + net.push_back( + mkldnn::reorder(reorder_pd, *tmp_data_mem, *dst_data_mem)); + stream(stream::kind::eager).submit(net).wait(); + } // delete primitive since it is not cached. if (do_not_cache) { delete conv_bwd_input; } } catch (mkldnn::error& e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); + string error_msg = "Status: " + std::to_string(e.status) + ", message: " + + string(e.message) + ", in file " + string(__FILE__) + + ":" + std::to_string(__LINE__); OP_REQUIRES_OK( context, errors::Aborted("Operation received an exception:", error_msg)); @@ -506,7 +530,7 @@ class MklConvCustomBackpropInputOp // Get TensorFlow shape of filter tensor. 
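The heart of this hunk is the eager-mode output path: the convolution still writes its result in MKL's blocked layout, but into a scratch tensor, and an `mkldnn::reorder` then converts it into the plain TF layout an eager caller expects. Below is a condensed sketch of just that reorder step, using the same MKL-DNN 0.x calls as the hunk; the descriptors and raw buffers are assumed to come from the surrounding kernel.

```cpp
// Sketch of the eager-mode reorder: convert a blocked-layout result into a
// TF-layout output buffer with a single mkldnn::reorder primitive.
#include <vector>
#include "mkldnn.hpp"

void ReorderMklToTf(const mkldnn::memory::desc& mkl_md,   // blocked layout
                    const mkldnn::memory::desc& tf_md,    // plain TF layout
                    void* mkl_buf, void* tf_buf,
                    const mkldnn::engine& cpu_engine) {
  using namespace mkldnn;
  memory::primitive_desc src_pd(mkl_md, cpu_engine);
  memory::primitive_desc dst_pd(tf_md, cpu_engine);
  memory src(src_pd, mkl_buf);   // primitive output, MKL blocked format
  memory dst(dst_pd, tf_buf);    // tensor handed back to the eager caller
  reorder::primitive_desc reorder_pd(src_pd, dst_pd);
  std::vector<primitive> net;
  net.push_back(reorder(reorder_pd, src, dst));
  stream(stream::kind::eager).submit(net).wait();
}
```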
TensorShape MakeFilterTfShape(OpKernelContext* context, const Tensor& filter_tensor) { - return GetTfShape(context, kInputIndex_Filter); + return GetTfShape(context, kInputIndex_Filter, eager_mode); } // Get the Tensorflow shape of Output (diff_src), @@ -557,26 +581,31 @@ class MklConvCustomBackpropInputOp } }; -#define REGISTER_MKL_CPU_KERNELS(T) \ - REGISTER_KERNEL_BUILDER( \ - Name("_MklConv2DBackpropInput") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ - MklConvCustomBackpropInputOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("_MklConv3DBackpropInputV2") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ - MklConvCustomBackpropInputOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("_MklDepthwiseConv2dNativeBackpropInput") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ - MklConvCustomBackpropInputOp); - +#define REGISTER_MKL_CPU_KERNELS(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklConv2DBackpropInput") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ + MklConvCustomBackpropInputOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklEagerConv2DBackpropInput") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklNameChangeOpLabel), \ + MklConvCustomBackpropInputOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklConv3DBackpropInputV2") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ + MklConvCustomBackpropInputOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklDepthwiseConv2dNativeBackpropInput") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ + MklConvCustomBackpropInputOp); TF_CALL_float(REGISTER_MKL_CPU_KERNELS); TF_CALL_bfloat16(REGISTER_MKL_CPU_KERNELS); #undef REGISTER_MKL_CPU_KERNELS diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc index a55dde64e1b..330677390d6 100644 --- a/tensorflow/core/ops/nn_ops.cc +++ b/tensorflow/core/ops/nn_ops.cc @@ -1281,9 +1281,9 @@ Status TopKShapeFn(InferenceContext* c) { DimensionHandle last_dim = c->Dim(input, -1); if (c->ValueKnown(last_dim) && c->ValueKnown(k_dim) && c->Value(last_dim) < c->Value(k_dim)) { - return errors::InvalidArgument( - "input must have last dimension >= k = ", c->Value(k_dim), " but is ", - c->Value(last_dim)); + return errors::InvalidArgument("input must have last dimension >= k = ", + c->Value(k_dim), " but is ", + c->Value(last_dim)); } // Replace last_dim with k_dim. @@ -1337,9 +1337,9 @@ REGISTER_OP("NthElement") DimensionHandle last_dim = c->Dim(input, -1); if (c->ValueKnown(last_dim) && c->ValueKnown(n_dim) && c->Value(last_dim) <= c->Value(n_dim)) { - return errors::InvalidArgument( - "Input must have last dimension > n = ", c->Value(n_dim), - " but is ", c->Value(last_dim)); + return errors::InvalidArgument("Input must have last dimension > n = ", + c->Value(n_dim), " but is ", + c->Value(last_dim)); } // Reduce last_dim for output tensor @@ -1782,6 +1782,33 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is expected to invoke these operators. 
)doc"); +REGISTER_OP("_MklEagerConv2DBackpropFilter") + .Input("input: T") + .Input("filter_sizes: int32") + .Input("out_backprop: T") + .Output("output: T") + .Attr("T: {bfloat16, float}") + .Attr("strides: list(int)") + .Attr("use_cudnn_on_gpu: bool = true") + .Attr(GetPaddingAttrStringWithExplicit()) + .Attr(GetExplicitPaddingsAttrString()) + .Attr(GetConvnetDataFormatAttrString()) + .Attr("dilations: list(int) = [1, 1, 1, 1]") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle s; + TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(1, &s)); + TF_RETURN_IF_ERROR(c->WithRank(s, 4, &s)); + c->set_output(0, s); + return Status::OK(); + }) + .Doc(R"doc( +MKL version of Conv2DBackpropFilter for Eager mode. Uses MKL DNN APIs +to compute the gradients of convolution with respect to the filter. + +NOTE Do not invoke this operator directly in Python. Eager Op rewrite pass is +expected to invoke these operators. +)doc"); + REGISTER_OP("__MklDummyConv2DBackpropFilterWithBias") .Input("input: T") .Input("filter_sizes: int32") @@ -1915,6 +1942,33 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is expected to invoke these operators. )doc"); +REGISTER_OP("_MklEagerConv2DBackpropInput") + .Input("input_sizes: int32") + .Input("filter: T") + .Input("out_backprop: T") + .Output("output: T") + .Attr("T: {bfloat16, float}") + .Attr("strides: list(int)") + .Attr("use_cudnn_on_gpu: bool = true") + .Attr(GetPaddingAttrStringWithExplicit()) + .Attr(GetExplicitPaddingsAttrString()) + .Attr(GetConvnetDataFormatAttrString()) + .Attr("dilations: list(int) = [1, 1, 1, 1]") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle s; + TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(0, &s)); + TF_RETURN_IF_ERROR(c->WithRank(s, 4, &s)); + c->set_output(0, s); + return Status::OK(); + }) + .Doc(R"doc( +MKL version of Convolution2D backward input for Eager mode. Uses MKL DNN APIs +to compute the gradients of convolution with respect to the input. + +NOTE Do not invoke this operator directly in Python. Eager op rewrite is +expected to invoke these operators. 
+)doc"); + REGISTER_OP("_MklConv3D") .Input("input: T") .Input("filter: T") From bef6b1cfb6d856d908ff695cb02718f7dd526a72 Mon Sep 17 00:00:00 2001 From: David Norman Date: Fri, 5 Jul 2019 12:12:50 +0100 Subject: [PATCH 0064/3053] Fix compilation errors in exhaustive test --- .../xla/tests/exhaustive_op_test_utils.cc | 8 ++-- .../xla/tests/exhaustive_op_test_utils.h | 42 ++++++++++--------- .../xla/tests/exhaustive_unary_test.cc | 3 +- 3 files changed, 29 insertions(+), 24 deletions(-) diff --git a/tensorflow/compiler/xla/tests/exhaustive_op_test_utils.cc b/tensorflow/compiler/xla/tests/exhaustive_op_test_utils.cc index 465da47faeb..02273d7debd 100644 --- a/tensorflow/compiler/xla/tests/exhaustive_op_test_utils.cc +++ b/tensorflow/compiler/xla/tests/exhaustive_op_test_utils.cc @@ -58,19 +58,19 @@ ExhaustiveOpTestBase::CreateExhaustiveF32Ranges() { namespace { ExhaustiveOpTestBase::ErrorSpec DefaultF64SpecGenerator(float) { - return ExhaustiveOpTestBase::ErrorSpec{0.0001, 0.0001}; + return ExhaustiveOpTestBase::ErrorSpec(0.0001, 0.0001); } ExhaustiveOpTestBase::ErrorSpec DefaultF32SpecGenerator(float) { - return ExhaustiveOpTestBase::ErrorSpec{0.0001, 0.0001}; + return ExhaustiveOpTestBase::ErrorSpec(0.0001, 0.0001); } ExhaustiveOpTestBase::ErrorSpec DefaultF16SpecGenerator(float) { - return ExhaustiveOpTestBase::ErrorSpec{0.001, 0.001}; + return ExhaustiveOpTestBase::ErrorSpec(0.001, 0.001); } ExhaustiveOpTestBase::ErrorSpec DefaultBF16SpecGenerator(float) { - return ExhaustiveOpTestBase::ErrorSpec{0.002, 0.02}; + return ExhaustiveOpTestBase::ErrorSpec(0.002, 0.02); } } // namespace diff --git a/tensorflow/compiler/xla/tests/exhaustive_op_test_utils.h b/tensorflow/compiler/xla/tests/exhaustive_op_test_utils.h index 3df4de295e3..b6db554cdaa 100644 --- a/tensorflow/compiler/xla/tests/exhaustive_op_test_utils.h +++ b/tensorflow/compiler/xla/tests/exhaustive_op_test_utils.h @@ -30,6 +30,26 @@ limitations under the License. namespace xla { using Eigen::half; +namespace int_type { +template +struct IntegralTypeWithByteWidth {}; + +template <> +struct IntegralTypeWithByteWidth<2> { + using type = uint16; +}; + +template <> +struct IntegralTypeWithByteWidth<4> { + using type = uint32; +}; + +template <> +struct IntegralTypeWithByteWidth<8> { + using type = uint64; +}; +} + class ExhaustiveOpTestBase : public ClientLibraryTestBase { public: struct ErrorSpec { @@ -41,6 +61,8 @@ class ExhaustiveOpTestBase : public ClientLibraryTestBase { // spec; this only covers the case when both `expected` and `actual` are // equal to 0. bool strict_signed_zeros = false; + + ErrorSpec(float a, float r) : abs_err(a), rel_err(r) {} }; // `ty` is the primitive type being tested. @@ -140,24 +162,6 @@ class ExhaustiveOpTestBase : public ClientLibraryTestBase { } } - template - struct IntegralTypeWithByteWidth {}; - - template <> - struct IntegralTypeWithByteWidth<2> { - using type = uint16; - }; - - template <> - struct IntegralTypeWithByteWidth<4> { - using type = uint32; - }; - - template <> - struct IntegralTypeWithByteWidth<8> { - using type = uint64; - }; - // Converts part or all bits in an uint64 to the value of the floating point // data type being tested. // @@ -170,7 +174,7 @@ class ExhaustiveOpTestBase : public ClientLibraryTestBase { // T is the type of the floating value represented by the `bits`. 
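Moving `IntegralTypeWithByteWidth` out of `ExhaustiveOpTestBase` and into a namespace is what fixes the build: explicit specializations of a member template are not permitted inside the class definition. A self-contained sketch of the resulting pattern follows; standard fixed-width types stand in for TensorFlow's `uint16`/`uint32`/`uint64`, and `std::memcpy` stands in for `BitCast`.

```cpp
// Namespace-scope trait mapping a byte width to an unsigned integer type,
// plus the bits -> floating-point conversion it enables.
#include <cstdint>
#include <cstring>
#include <iostream>

namespace test_util {
template <int kByteWidth>
struct IntegralTypeWithByteWidth {};

template <> struct IntegralTypeWithByteWidth<2> { using type = uint16_t; };
template <> struct IntegralTypeWithByteWidth<4> { using type = uint32_t; };
template <> struct IntegralTypeWithByteWidth<8> { using type = uint64_t; };
}  // namespace test_util

// Reinterpret the low bytes of a bit pattern as a floating-point value of
// type T, mirroring the ConvertValue helper in the test harness.
template <typename T>
T ConvertValue(uint64_t bits) {
  using I = typename test_util::IntegralTypeWithByteWidth<sizeof(T)>::type;
  I used_bits = static_cast<I>(bits);
  T value;
  std::memcpy(&value, &used_bits, sizeof(T));  // stand-in for xla::BitCast
  return value;
}

int main() {
  // 0x3f800000 is the IEEE-754 bit pattern of 1.0f.
  std::cout << ConvertValue<float>(0x3f800000u) << "\n";  // prints 1
  return 0;
}
```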
template T ConvertValue(uint64 bits) { - using I = typename IntegralTypeWithByteWidth::type; + using I = typename int_type::IntegralTypeWithByteWidth::type; I used_bits = static_cast(bits); return BitCast(used_bits); } diff --git a/tensorflow/compiler/xla/tests/exhaustive_unary_test.cc b/tensorflow/compiler/xla/tests/exhaustive_unary_test.cc index 36584b43c59..761d84c2a8e 100644 --- a/tensorflow/compiler/xla/tests/exhaustive_unary_test.cc +++ b/tensorflow/compiler/xla/tests/exhaustive_unary_test.cc @@ -369,7 +369,8 @@ class Exhaustive32BitOrLessUnaryTest // type being tested. template void FillInput(Literal* input_literal) { - using IntegralT = typename IntegralTypeWithByteWidth::type; + using IntegralT = + typename int_type::IntegralTypeWithByteWidth::type; int64 input_size = input_literal->element_count(); int64 begin, end; std::tie(begin, end) = std::get<1>(GetParam()); From 2ff0abe8fef1f2c8105bdf81753625c59c102e71 Mon Sep 17 00:00:00 2001 From: Imran Salam Date: Sat, 6 Jul 2019 17:52:16 +0500 Subject: [PATCH 0065/3053] [TF 2.0 API Docs] tf.image.image_gradients Added usage example for image_gradients. The issue is raised in the link https://github.com/tensorflow/tensorflow/issues/30445 --- tensorflow/python/ops/image_ops_impl.py | 39 ++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index 211231714c6..c216aa885aa 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -3403,7 +3403,44 @@ def image_gradients(image): Returns: Pair of tensors (dy, dx) holding the vertical and horizontal image gradients (1-step finite difference). - + + Usage Example: + ```python + BATCH_SIZE = 1 + IMAGE_HEIGHT = 5 + IMAGE_WIDTH = 5 + CHANNELS = 1 + image = tf.reshape(tf.range(IMAGE_HEIGHT * IMAGE_WIDTH * CHANNELS, + delta=1, dtype=tf.float32), + shape=(BATCH_SIZE, IMAGE_HEIGHT, IMAGE_WIDTH, CHANNELS)) + dx, dy = tf.image.image_gradients(image) + print(image[0, :,:,0]) + print('-' * 20) + print(dx[0, :,:,0]) + print('-' * 20) + print(dy[0, :,:,0]) + tf.Tensor( + [[ 0. 1. 2. 3. 4.] + [ 5. 6. 7. 8. 9.] + [10. 11. 12. 13. 14.] + [15. 16. 17. 18. 19.] + [20. 21. 22. 23. 24.]], shape=(5, 5), dtype=float32) + -------------------- + tf.Tensor( + [[5. 5. 5. 5. 5.] + [5. 5. 5. 5. 5.] + [5. 5. 5. 5. 5.] + [5. 5. 5. 5. 5.] + [0. 0. 0. 0. 0.]], shape=(5, 5), dtype=float32) + -------------------- + tf.Tensor( + [[1. 1. 1. 1. 0.] + [1. 1. 1. 1. 0.] + [1. 1. 1. 1. 0.] + [1. 1. 1. 1. 0.] + [1. 1. 1. 1. 0.]], shape=(5, 5), dtype=float32) + ``` + Raises: ValueError: If `image` is not a 4D tensor. """ From 50fc84a6b3ec597eecd5a52f0f0689a33e2747d1 Mon Sep 17 00:00:00 2001 From: "Coady, Patrick" Date: Sun, 7 Jul 2019 10:08:53 -0400 Subject: [PATCH 0066/3053] Update loss docstrings to match behavior. --- tensorflow/python/keras/losses.py | 21 ++++++++++--------- .../python/ops/losses/loss_reduction.py | 10 ++++----- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/tensorflow/python/keras/losses.py b/tensorflow/python/keras/losses.py index b80fafbd61e..8c00b7543d7 100644 --- a/tensorflow/python/keras/losses.py +++ b/tensorflow/python/keras/losses.py @@ -95,21 +95,22 @@ class Loss(object): """Invokes the `Loss` instance. Args: - y_true: Ground truth values. - y_pred: The predicted values. - sample_weight: Optional `Tensor` whose rank is either 0, or the same rank - as `y_true`, or is broadcastable to `y_true`. 
`sample_weight` acts as a + y_true: Ground truth values. shape = `[batch_size, d0, .. dN]` + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]` + sample_weight: Optional `sample_weight` acts as a coefficient for the loss. If a scalar is provided, then the loss is simply scaled by the given value. If `sample_weight` is a tensor of size `[batch_size]`, then the total loss for each sample of the batch is rescaled by the corresponding element in the `sample_weight` vector. If - the shape of `sample_weight` matches the shape of `y_pred`, then the - loss of each measurable element of `y_pred` is scaled by the - corresponding value of `sample_weight`. + the shape of `sample_weight` is `[batch_size, d0, .. dN-1]` (or can be + broadcasted to this shape), then each loss element of `y_pred` is scaled + by the corresponding value of `sample_weight`. (Note on`dN-1`: all loss + functions reduce by 1 dimension, usually axis=-1.) Returns: - Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same - shape as `y_true`; otherwise, it is scalar. + Weighted loss float `Tensor`. If `reduction` is `NONE`, this has + shape `[batch_size, d0, .. dN-1]`; otherwise, it is scalar. (Note `dN-1` + because all loss functions reduce by 1 dimension, usually axis=-1.) Raises: ValueError: If the shape of `sample_weight` is invalid. @@ -163,7 +164,7 @@ class Loss(object): '`tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE` using global batch ' 'size like:\n```\nwith strategy.scope():\n' ' loss_obj = tf.keras.losses.CategoricalCrossentropy(' - 'reduction=tf.keras.losses.reduction.None)\n....\n' + 'reduction=tf.keras.losses.reduction.NONE)\n....\n' ' loss = tf.reduce_sum(loss_obj(labels, predictions)) * ' '(1. / global_batch_size)\n```\nPlease see ' 'https://www.tensorflow.org/alpha/tutorials/distribute/training_loops' diff --git a/tensorflow/python/ops/losses/loss_reduction.py b/tensorflow/python/ops/losses/loss_reduction.py index 483a325570b..7fdc7916440 100644 --- a/tensorflow/python/ops/losses/loss_reduction.py +++ b/tensorflow/python/ops/losses/loss_reduction.py @@ -28,10 +28,10 @@ class ReductionV2(object): used with `tf.distribute.Strategy`, outside of built-in training loops such as `tf.keras` `compile` and `fit`, we expect reduction value to be `SUM` or `NONE`. Using `AUTO` in that case will raise an error. - * `NONE`: Un-reduced weighted losses with the same shape as input. When this - reduction type used with built-in Keras training loops like - `fit`/`evaluate`, the unreduced vector loss is passed to the optimizer but - the reported loss will be a scalar value. + * `NONE`: Weighted losses with one dimension reduced (axis=-1, or axis + specified by loss function). When this reduction type used with built-in + Keras training loops like `fit`/`evaluate`, the unreduced vector loss is + passed to the optimizer but the reported loss will be a scalar value. * `SUM`: Scalar sum of weighted losses. * `SUM_OVER_BATCH_SIZE`: Scalar `SUM` divided by number of elements in losses. This reduction type is not supported when used with @@ -42,7 +42,7 @@ class ReductionV2(object): ``` with strategy.scope(): loss_obj = tf.keras.losses.CategoricalCrossentropy( - reduction=tf.keras.losses.Reduction.None) + reduction=tf.keras.losses.Reduction.NONE) .... loss = tf.reduce_sum(loss_object(labels, predictions)) * (1. 
/ global_batch_size) From 46b6cde864060e59f3b437f2b2be440798a1e40e Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Mon, 8 Jul 2019 00:22:04 -0700 Subject: [PATCH 0067/3053] Removed duplicated registration of Less with bfloat16 This fix tries to address the issue raised in 30476 where Op Less was registered twice which triggered `Multiple OpKernel registrations` error This fix removes the duplication. This fix fies 30476. Signed-off-by: Yong Tang --- tensorflow/core/kernels/cwise_op_less.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/cwise_op_less.cc b/tensorflow/core/kernels/cwise_op_less.cc index 563bb7d4566..062a029f069 100644 --- a/tensorflow/core/kernels/cwise_op_less.cc +++ b/tensorflow/core/kernels/cwise_op_less.cc @@ -18,8 +18,7 @@ limitations under the License. namespace tensorflow { REGISTER5(BinaryOp, CPU, "Less", functor::less, float, Eigen::half, double, bfloat16, int32); -REGISTER5(BinaryOp, CPU, "Less", functor::less, int64, uint8, int8, int16, - bfloat16); +REGISTER4(BinaryOp, CPU, "Less", functor::less, int64, uint8, int8, int16); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM REGISTER7(BinaryOp, GPU, "Less", functor::less, float, Eigen::half, double, From 1ba4fd12134334aff10efc6a49930e3cee25f8c1 Mon Sep 17 00:00:00 2001 From: David Norman Date: Tue, 9 Jul 2019 07:25:32 +0100 Subject: [PATCH 0068/3053] Change the namespace int_type -> test_util --- tensorflow/compiler/xla/tests/exhaustive_op_test_utils.h | 4 ++-- tensorflow/compiler/xla/tests/exhaustive_unary_test.cc | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/xla/tests/exhaustive_op_test_utils.h b/tensorflow/compiler/xla/tests/exhaustive_op_test_utils.h index b6db554cdaa..212c0e6f522 100644 --- a/tensorflow/compiler/xla/tests/exhaustive_op_test_utils.h +++ b/tensorflow/compiler/xla/tests/exhaustive_op_test_utils.h @@ -30,7 +30,7 @@ limitations under the License. namespace xla { using Eigen::half; -namespace int_type { +namespace test_util { template struct IntegralTypeWithByteWidth {}; @@ -174,7 +174,7 @@ class ExhaustiveOpTestBase : public ClientLibraryTestBase { // T is the type of the floating value represented by the `bits`. 
template T ConvertValue(uint64 bits) { - using I = typename int_type::IntegralTypeWithByteWidth::type; + using I = typename test_util::IntegralTypeWithByteWidth::type; I used_bits = static_cast(bits); return BitCast(used_bits); } diff --git a/tensorflow/compiler/xla/tests/exhaustive_unary_test.cc b/tensorflow/compiler/xla/tests/exhaustive_unary_test.cc index 761d84c2a8e..f028e0aee48 100644 --- a/tensorflow/compiler/xla/tests/exhaustive_unary_test.cc +++ b/tensorflow/compiler/xla/tests/exhaustive_unary_test.cc @@ -370,7 +370,7 @@ class Exhaustive32BitOrLessUnaryTest template void FillInput(Literal* input_literal) { using IntegralT = - typename int_type::IntegralTypeWithByteWidth::type; + typename test_util::IntegralTypeWithByteWidth::type; int64 input_size = input_literal->element_count(); int64 begin, end; std::tie(begin, end) = std::get<1>(GetParam()); From 57902bfffd2153b09a8036b8e653d78cb11a9bdf Mon Sep 17 00:00:00 2001 From: Imran Salam Date: Tue, 9 Jul 2019 13:07:02 +0500 Subject: [PATCH 0069/3053] Changes in printing output Each output is shown after is subsequent print --- tensorflow/python/ops/image_ops_impl.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index c216aa885aa..0725cb169bb 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -3415,25 +3415,21 @@ def image_gradients(image): shape=(BATCH_SIZE, IMAGE_HEIGHT, IMAGE_WIDTH, CHANNELS)) dx, dy = tf.image.image_gradients(image) print(image[0, :,:,0]) - print('-' * 20) - print(dx[0, :,:,0]) - print('-' * 20) - print(dy[0, :,:,0]) - tf.Tensor( + tf.Tensor( [[ 0. 1. 2. 3. 4.] [ 5. 6. 7. 8. 9.] [10. 11. 12. 13. 14.] [15. 16. 17. 18. 19.] [20. 21. 22. 23. 24.]], shape=(5, 5), dtype=float32) - -------------------- - tf.Tensor( + print(dx[0, :,:,0]) + tf.Tensor( [[5. 5. 5. 5. 5.] [5. 5. 5. 5. 5.] [5. 5. 5. 5. 5.] [5. 5. 5. 5. 5.] - [0. 0. 0. 0. 0.]], shape=(5, 5), dtype=float32) - -------------------- - tf.Tensor( + [0. 0. 0. 0. 0.]], shape=(5, 5), dtype=float32) + print(dy[0, :,:,0]) + tf.Tensor( [[1. 1. 1. 1. 0.] [1. 1. 1. 1. 0.] [1. 1. 1. 1. 0.] From 7287d5b85c2c5e8790a7daa99a0817824cdf7503 Mon Sep 17 00:00:00 2001 From: Pete Blacker Date: Tue, 9 Jul 2019 11:12:40 +0100 Subject: [PATCH 0070/3053] Updates to PR #30362 to address PeteWardens review --- tensorflow/lite/experimental/micro/micro_interpreter.cc | 4 +++- .../lite/experimental/micro/simple_tensor_allocator.cc | 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/experimental/micro/micro_interpreter.cc b/tensorflow/lite/experimental/micro/micro_interpreter.cc index 3dc83edf458..393151a6dfd 100644 --- a/tensorflow/lite/experimental/micro/micro_interpreter.cc +++ b/tensorflow/lite/experimental/micro/micro_interpreter.cc @@ -78,7 +78,7 @@ MicroInterpreter::MicroInterpreter(const Model* model, subgraph_ = (*subgraphs)[0]; tensors_ = subgraph_->tensors(); operators_ = subgraph_->operators(); - + context_.tensors_size = tensors_->size(); context_.tensors = reinterpret_cast(tensor_allocator_->AllocateMemory( @@ -100,6 +100,8 @@ MicroInterpreter::MicroInterpreter(const Model* model, // If the system is big endian then convert weights from the flatbuffer from // little to big endian on startup so that it does not need to be done during // inference. + // NOTE: This requires that the flatbuffer is held in memory which can be + // modified by this process. 
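The note above is the important constraint: converting weights to the host byte order at start-up rewrites the flatbuffer in place, so the model cannot live in read-only storage on big-endian targets. An illustrative, stand-alone sketch of such an in-place fix-up (not the interpreter's actual code) is:

```cpp
// Hypothetical example: swap a float weight buffer between little- and
// big-endian byte order in place. Requires writable storage.
#include <cstdint>
#include <cstring>
#include <iostream>

inline uint32_t ByteSwap32(uint32_t v) {
  return (v >> 24) | ((v >> 8) & 0x0000ff00u) |
         ((v << 8) & 0x00ff0000u) | (v << 24);
}

void SwapFloatBufferInPlace(float* data, int count) {
  for (int i = 0; i < count; ++i) {
    uint32_t bits;
    std::memcpy(&bits, &data[i], sizeof(bits));
    bits = ByteSwap32(bits);
    std::memcpy(&data[i], &bits, sizeof(bits));  // writes back into the buffer
  }
}

int main() {
  float weights[2] = {1.0f, -2.5f};
  SwapFloatBufferInPlace(weights, 2);  // now in the opposite byte order
  SwapFloatBufferInPlace(weights, 2);  // swapping twice restores the values
  std::cout << weights[0] << " " << weights[1] << "\n";  // 1 -2.5
  return 0;
}
```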
if (!FLATBUFFERS_LITTLEENDIAN) { for (int t = 0; t < tensors_size(); ++t) { TfLiteTensor* thisTensor = &context_.tensors[t]; diff --git a/tensorflow/lite/experimental/micro/simple_tensor_allocator.cc b/tensorflow/lite/experimental/micro/simple_tensor_allocator.cc index 16eb01ecd4d..47b305a2202 100644 --- a/tensorflow/lite/experimental/micro/simple_tensor_allocator.cc +++ b/tensorflow/lite/experimental/micro/simple_tensor_allocator.cc @@ -135,8 +135,9 @@ TfLiteStatus SimpleTensorAllocator::AllocateTensor( src_quantization->zero_point() && (src_quantization->zero_point()->size() > 0)) { result->params.scale = src_quantization->scale()->Get(0); - memcpy(&result->params.zero_point, - (int64_t*)src_quantization->zero_point()->Data(), sizeof(int64_t)); + for (int b = 0; b < sizeof(int64_t); ++b) + *(((char*)&result->params.zero_point) + b) = + *(((char*)src_quantization->zero_point()->Data()) + b); result->params.zero_point = flatbuffers::EndianScalar(result->params.zero_point); } From 41852334f233565bbb9aa73d7aa719ec21fb731c Mon Sep 17 00:00:00 2001 From: Pete Blacker Date: Tue, 9 Jul 2019 13:50:57 +0100 Subject: [PATCH 0071/3053] Fixed automatic downloading and install scripts for getting leon tool chain. --- .../lite/experimental/micro/testing/test_leon_binary.sh | 3 ++- .../experimental/micro/tools/make/download_and_extract.sh | 4 ++++ .../experimental/micro/tools/make/targets/leon_makefile.inc | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/experimental/micro/testing/test_leon_binary.sh b/tensorflow/lite/experimental/micro/testing/test_leon_binary.sh index 6a84322e1d4..0b42fa8249b 100755 --- a/tensorflow/lite/experimental/micro/testing/test_leon_binary.sh +++ b/tensorflow/lite/experimental/micro/testing/test_leon_binary.sh @@ -32,8 +32,9 @@ mkdir -p ${MICRO_LOG_PATH} SCRIPT_PATH="`dirname \"$BASH_SOURCE\"`" SCRIPT_PATH="`( cd \"$SCRIPT_PATH\" && pwd )`" LEON_COMMANDS="$SCRIPT_PATH/leon_commands" +TSIM_PATH="tensorflow/lite/experimental/micro/tools/make/downloads/tsim/tsim/linux-x64/tsim-leon3" -tsim-leon3 $1 -c ${LEON_COMMANDS} 2>&1 | tee ${MICRO_LOG_FILENAME} +${TSIM_PATH} $1 -c ${LEON_COMMANDS} 2>&1 | tee ${MICRO_LOG_FILENAME} if grep -q "$2" ${MICRO_LOG_FILENAME} then diff --git a/tensorflow/lite/experimental/micro/tools/make/download_and_extract.sh b/tensorflow/lite/experimental/micro/tools/make/download_and_extract.sh index 37ad740ec64..8c22fdc5289 100755 --- a/tensorflow/lite/experimental/micro/tools/make/download_and_extract.sh +++ b/tensorflow/lite/experimental/micro/tools/make/download_and_extract.sh @@ -92,6 +92,8 @@ download_and_extract() { if [[ "${url}" == *gz ]]; then tar -C "${dir}" --strip-components=1 -xzf ${tempfile} + elif [[ "${url}" == *tar.xz ]]; then + tar -C "${dir}" --strip-components=1 -xf ${tempfile} elif [[ "${url}" == *bz2 ]]; then curl -Ls "${url}" > ${tempdir}/tarred.bz2 tar -C "${dir}" --strip-components=1 -xjf ${tempfile} @@ -106,6 +108,8 @@ download_and_extract() { else cp -R ${tempdir2}/* ${dir}/ fi + else + echo "Error unsupported archive type. Failed to extract tool after download." 
fi rm -rf ${tempdir2} ${tempdir} diff --git a/tensorflow/lite/experimental/micro/tools/make/targets/leon_makefile.inc b/tensorflow/lite/experimental/micro/tools/make/targets/leon_makefile.inc index 1504a09d1b8..7d7832411b3 100644 --- a/tensorflow/lite/experimental/micro/tools/make/targets/leon_makefile.inc +++ b/tensorflow/lite/experimental/micro/tools/make/targets/leon_makefile.inc @@ -4,7 +4,7 @@ ifeq ($(TARGET), leon) CXXFLAGS += -std=c++11 $(PLATFORM_FLAGS) CCFLAGS += $(PLATFORM_FLAGS) TARGET_ARCH := leon - TARGET_TOOLCHAIN_PREFIX := sparc-gaisler-elf- + TARGET_TOOLCHAIN_PREFIX := tensorflow/lite/experimental/micro/tools/make/downloads/leon_bcc2/bin/sparc-gaisler-elf- TEST_SCRIPT := tensorflow/lite/experimental/micro/testing/test_leon_binary.sh GCC_LEON := $(MAKEFILE_DIR)/downloads/leon_bcc2/ From 65985751a9def39929f582e78e6b434d9909f1c7 Mon Sep 17 00:00:00 2001 From: jerryyin Date: Fri, 28 Jun 2019 18:52:51 +0000 Subject: [PATCH 0072/3053] [ROCm] Adding support to depthwise_conv_op --- tensorflow/core/kernels/depthwise_conv_op.cc | 12 ++- tensorflow/core/kernels/depthwise_conv_op.h | 2 +- .../core/kernels/depthwise_conv_op_gpu.h | 73 +++++++++++-------- .../depthwise_conv_op_gpu_double.cu.cc | 4 +- .../kernels/depthwise_conv_op_gpu_float.cu.cc | 4 +- .../kernels/depthwise_conv_op_gpu_half.cu.cc | 4 +- 6 files changed, 56 insertions(+), 43 deletions(-) diff --git a/tensorflow/core/kernels/depthwise_conv_op.cc b/tensorflow/core/kernels/depthwise_conv_op.cc index ceaeaac21de..a7a0088fd3d 100644 --- a/tensorflow/core/kernels/depthwise_conv_op.cc +++ b/tensorflow/core/kernels/depthwise_conv_op.cc @@ -38,10 +38,14 @@ limitations under the License. #include "tensorflow/core/util/use_cudnn.h" #include "tensorflow/core/util/work_sharder.h" +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + #if GOOGLE_CUDA #include "third_party/gpus/cudnn/cudnn.h" +#endif + #include "tensorflow/core/platform/stream_executor.h" -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM namespace tensorflow { @@ -246,7 +250,7 @@ extern template struct LaunchConv2DOp; extern template struct LaunchConv2DOp; extern template struct LaunchConv2DOp; -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM // Extern template instantiated in conv_ops.cc. 
extern template struct LaunchConv2DOp; @@ -461,7 +465,7 @@ TF_CALL_float(REGISTER_CPU_KERNEL); TF_CALL_double(REGISTER_CPU_KERNEL); #endif -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define REGISTER_GPU_KERNEL(T) \ REGISTER_KERNEL_BUILDER( \ @@ -494,6 +498,6 @@ TF_CALL_half(REGISTER_GROUPED_CONV_KERNEL); TF_CALL_float(REGISTER_GROUPED_CONV_KERNEL); TF_CALL_double(REGISTER_GROUPED_CONV_KERNEL); #endif // CUDNN_VERSION -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } // namespace tensorflow diff --git a/tensorflow/core/kernels/depthwise_conv_op.h b/tensorflow/core/kernels/depthwise_conv_op.h index b2d58988913..508a25e3397 100644 --- a/tensorflow/core/kernels/depthwise_conv_op.h +++ b/tensorflow/core/kernels/depthwise_conv_op.h @@ -80,7 +80,7 @@ struct LaunchDepthwiseConvBackpropFilterOp { TensorFormat data_format); }; -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM template struct LaunchDepthwiseConvOp { void operator()(OpKernelContext* ctx, const DepthwiseArgs& args, diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.h b/tensorflow/core/kernels/depthwise_conv_op_gpu.h index 721088f80ba..ec13259127e 100644 --- a/tensorflow/core/kernels/depthwise_conv_op_gpu.h +++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.h @@ -16,11 +16,10 @@ limitations under the License. #ifndef TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_GPU_H_ #define TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_GPU_H_ -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define EIGEN_USE_GPU #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" -#include "third_party/cub/util_ptx.cuh" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/kernels/depthwise_conv_op.h" #include "tensorflow/core/platform/types.h" @@ -79,7 +78,7 @@ inline EIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dBackpropFilterGPUSmall( // convolution depending on a template argument of this enum. enum DepthwiseConv2dDirection { DIRECTION_FORWARD, DIRECTION_BACKWARD }; -// A Cuda kernel to compute the depthwise convolution forward pass +// A Gpu kernel to compute the depthwise convolution forward pass // in NHWC format. template @@ -103,7 +102,7 @@ __global__ void __launch_bounds__(1024, 2) const int out_width = args.out_cols; const int out_depth = args.out_depth; - CUDA_1D_KERNEL_LOOP(thread_id, num_outputs) { + GPU_1D_KERNEL_LOOP(thread_id, num_outputs) { // Compute the indexes of this thread in the output. const int out_channel = thread_id % out_depth; const int out_col = (thread_id / out_depth) % out_width; @@ -192,8 +191,10 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall( typedef typename detail::PseudoHalfType::Type S; assert(CanLaunchDepthwiseConv2dGPUSmall(args)); // Holds block plus halo and filter data for blockDim.x depths. - GPU_DYNAMIC_SHARED_MEM_DECL(8, unsigned char, shared_memory); static_assert(sizeof(S) <= 8, "Insufficient alignment detected"); + + GPU_DYNAMIC_SHARED_MEM_DECL(8, unsigned char, shared_memory); + S* const shared_data = reinterpret_cast(shared_memory); const int num_batches = args.batch; @@ -323,7 +324,7 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall( } } -// A Cuda kernel to compute the depthwise convolution forward pass +// A Gpu kernel to compute the depthwise convolution forward pass // in NCHW format. 
template @@ -347,7 +348,7 @@ __global__ void __launch_bounds__(1024, 2) const int out_width = args.out_cols; const int out_depth = args.out_depth; - CUDA_1D_KERNEL_LOOP(thread_id, num_outputs) { + GPU_1D_KERNEL_LOOP(thread_id, num_outputs) { // Compute the indexes of this thread in the output. // // We want coalesced reads so we make sure that each warp reads @@ -480,8 +481,10 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall( typedef typename detail::PseudoHalfType::Type S; assert(CanLaunchDepthwiseConv2dGPUSmall(args)); // Holds block plus halo and filter data for blockDim.z depths. - GPU_DYNAMIC_SHARED_MEM_DECL(8, unsigned char, shared_memory); static_assert(sizeof(S) <= 8, "Insufficient alignment detected"); + + GPU_DYNAMIC_SHARED_MEM_DECL(8, unsigned char, shared_memory); + S* const shared_data = reinterpret_cast(shared_memory); const int num_batches = args.batch; @@ -779,7 +782,7 @@ Status LaunchDepthwiseConv2dGPU(OpKernelContext* ctx, const DepthwiseArgs& args, } } -// A simple launch pad to launch the Cuda kernel for depthwise convolution. +// A simple launch pad to launch the Gpu kernel for depthwise convolution. template void LaunchDepthwiseConvOp::operator()(OpKernelContext* ctx, const DepthwiseArgs& args, @@ -795,7 +798,7 @@ void LaunchDepthwiseConvOp::operator()(OpKernelContext* ctx, } } -// A Cuda kernel to compute the depthwise convolution backprop w.r.t. input. +// A GPU kernel to compute the depthwise convolution backprop w.r.t. input. template __global__ void __launch_bounds__(640, 2) @@ -819,7 +822,7 @@ __global__ void __launch_bounds__(640, 2) const int out_width = args.out_cols; const int out_depth = args.out_depth; - CUDA_1D_KERNEL_LOOP(thread_id, num_in_backprop) { + GPU_1D_KERNEL_LOOP(thread_id, num_in_backprop) { // Compute the indexes of this thread in the output. const int in_channel = thread_id % in_depth; const int in_col = (thread_id / in_depth) % in_width; @@ -891,7 +894,7 @@ __global__ void __launch_bounds__(640, 2) // TODO(vrv): Consider assigning threads to output and using // atomics for accumulation, similar to the filter case. - CUDA_1D_KERNEL_LOOP(thread_id, num_in_backprop) { + GPU_1D_KERNEL_LOOP(thread_id, num_in_backprop) { // Compute the indexes of this thread in the input. const int in_col = thread_id % in_width; const int in_row = (thread_id / in_width) % in_height; @@ -998,7 +1001,7 @@ Status LaunchDepthwiseConv2dBackpropInputGPU(OpKernelContext* ctx, } } -// A simple launch pad to launch the Cuda kernel for depthwise convolution. +// A simple launch pad to launch the Gpu kernel for depthwise convolution. template void LaunchDepthwiseConvBackpropInputOp::operator()( OpKernelContext* ctx, const DepthwiseArgs& args, const T* out_backprop, @@ -1014,7 +1017,7 @@ void LaunchDepthwiseConvBackpropInputOp::operator()( } } -// A Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. +// A GPU kernel to compute the depthwise convolution backprop w.r.t. filter. // TODO: Add fp32 accumulation to half calls of this function. This addition // is non-trivial as the partial sums are added directly to the output template +#if GOOGLE_CUDA __device__ __forceinline__ T WarpSumReduce(T val) { +#elif TENSORFLOW_USE_ROCM +__device__ inline T WarpSumReduce(T val) { +#endif // support only power-of-two widths. 
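`WarpSumReduce` is an XOR-shuffle butterfly: on each step every lane adds the value held by the lane whose index differs by `delta`, halving `delta` until all lanes of a `kWidth`-wide group hold the group sum. A host-side C++ simulation of that pattern (array slots stand in for lanes, and direct reads stand in for `GpuShuffleXorSync`):

```cpp
// CPU simulation of the XOR-shuffle butterfly reduction over sub-warps of
// kWidth lanes; no GPU required.
#include <cassert>
#include <vector>

template <int kWidth>
std::vector<float> WarpSumReduceSim(std::vector<float> lanes) {
  static_assert((kWidth & (kWidth - 1)) == 0, "power-of-two widths only");
  for (int delta = kWidth / 2; delta > 0; delta /= 2) {
    std::vector<float> next(lanes.size());
    for (int lane = 0; lane < static_cast<int>(lanes.size()); ++lane) {
      // Partner lane differs only in bits below kWidth, so it stays in the
      // same kWidth-wide group (this is what the XOR shuffle does).
      int partner = lane ^ delta;
      next[lane] = lanes[lane] + lanes[partner];
    }
    lanes = next;
  }
  return lanes;  // every lane of each kWidth group now holds the group sum
}

int main() {
  // Two groups of 4 lanes: {1,2,3,4} and {10,20,30,40}.
  auto out = WarpSumReduceSim<4>({1, 2, 3, 4, 10, 20, 30, 40});
  for (int i = 0; i < 4; ++i) assert(out[i] == 10);
  for (int i = 4; i < 8; ++i) assert(out[i] == 100);
  return 0;
}
```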
assert(__popc(kWidth) == 1); - int sub_warp = cub::LaneId() / kWidth; + int sub_warp = GpuLaneId() / kWidth; int zeros = sub_warp * kWidth; unsigned mask = ((1UL << kWidth) - 1) << zeros; for (int delta = kWidth / 2; delta > 0; delta /= 2) { - val += CudaShuffleXorSync(mask, val, delta); + val += GpuShuffleXorSync(mask, val, delta); } return val; } @@ -1158,8 +1165,10 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall( typedef typename detail::PseudoHalfType::Type S; assert(CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(args, blockDim.z)); // Holds block plus halo and filter data for blockDim.x depths. - GPU_DYNAMIC_SHARED_MEM_DECL(8, unsigned char, shared_memory); static_assert(sizeof(S) <= 8, "Insufficient alignment detected"); + + GPU_DYNAMIC_SHARED_MEM_DECL(8, unsigned char, shared_memory); + S* const shared_data = reinterpret_cast(shared_memory); const int num_batches = args.batch; @@ -1253,7 +1262,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall( // Note: the condition to reach this is uniform across the entire block. __syncthreads(); - unsigned active_threads = CudaBallotSync(kCudaWarpAll, channel_in_range); + unsigned active_threads = GpuBallotSync(kCudaWarpAll, channel_in_range); if (channel_in_range) { const T* const out_ptr = inout_offset + output; @@ -1268,7 +1277,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall( S val = out1 * tile_ptr[0] + out2 * tile_ptr[tile_offset]; // Warp-accumulate pixels of the same depth and write to accumulator. for (int delta = 16; delta >= kBlockDepth; delta /= 2) { - val += CudaShuffleXorSync(active_threads, val, delta); + val += GpuShuffleXorSync(active_threads, val, delta); } if (!(thread_idx & 32 - kBlockDepth) /* lane_idx < kBlockDepth */) { *accum_ptr = val; @@ -1294,14 +1303,14 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall( // Warp-accumulate the pixels of the same depth from the accumulator. val = WarpSumReduce(val); if (!(thread_idx & kAccumPixels - 1)) { - CudaAtomicAdd(filter_offset + filter, static_cast(val)); + GpuAtomicAdd(filter_offset + filter, static_cast(val)); } } } } } -// A Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. +// A Gpu kernel to compute the depthwise convolution backprop w.r.t. filter. template __global__ void __launch_bounds__(640, 2) @@ -1326,7 +1335,7 @@ __global__ void __launch_bounds__(640, 2) const int out_width = args.out_cols; const int out_depth = args.out_depth; - CUDA_1D_KERNEL_LOOP(thread_id, num_out_backprop) { + GPU_1D_KERNEL_LOOP(thread_id, num_out_backprop) { // Compute the indexes of this thread in the output. const int out_col = thread_id % out_width; const int out_row = (thread_id / out_width) % out_height; @@ -1370,7 +1379,7 @@ __global__ void __launch_bounds__(640, 2) (dm + depth_multiplier * (in_channel + in_depth * (filter_col + filter_width * filter_row))); - CudaAtomicAdd(addr, partial_sum); + GpuAtomicAdd(addr, partial_sum); } } } else { @@ -1402,7 +1411,7 @@ __global__ void __launch_bounds__(640, 2) // contention on the destination; 2. Have each thread compute one // gradient for an element in the filters. This should work well // when the input depth is big and filter size is not too small. 
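Both accumulation strategies described above end in the same place: many threads contribute partial sums to one filter-gradient element, so the final write has to be an atomic read-modify-write (`GpuAtomicAdd`), or concurrent updates would be lost. A CPU-side sketch of the same requirement, with `std::atomic` standing in for the device atomic:

```cpp
// Many workers add partial sums into one shared accumulator; a plain +=
// here would race and drop updates, an atomic read-modify-write does not.
#include <atomic>
#include <iostream>
#include <thread>
#include <vector>

int main() {
  std::atomic<float> filter_grad{0.0f};
  auto worker = [&filter_grad](float partial_sum) {
    // fetch_add on atomic<float> is C++20; a CAS loop stays portable.
    float cur = filter_grad.load();
    while (!filter_grad.compare_exchange_weak(cur, cur + partial_sum)) {
    }
  };
  std::vector<std::thread> threads;
  for (int i = 0; i < 8; ++i) threads.emplace_back(worker, 0.5f);
  for (auto& t : threads) t.join();
  std::cout << filter_grad.load() << "\n";  // 4, with no lost updates
  return 0;
}
```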
- CudaAtomicAdd(addr, partial_sum); + GpuAtomicAdd(addr, partial_sum); } } } @@ -1521,7 +1530,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall( // Note: the condition to reach this is uniform across the entire block. __syncthreads(); - unsigned active_threads = CudaBallotSync(kCudaWarpAll, channel_in_range); + unsigned active_threads = GpuBallotSync(kCudaWarpAll, channel_in_range); if (channel_in_range) { const T* const out_ptr = inout_offset + output; @@ -1536,7 +1545,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall( S val = out1 * tile_ptr[0] + out2 * tile_ptr[tile_offset]; // Warp-accumulate pixels of the same depth and write to accumulator. for (int delta = 16 / kBlockDepth; delta > 0; delta /= 2) { - val += CudaShuffleXorSync(active_threads, val, delta); + val += GpuShuffleXorSync(active_threads, val, delta); } if (!(thread_idx & 32 / kBlockDepth - 1)) { *accum_ptr = val; // kBlockDepth threads per warp. @@ -1563,7 +1572,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall( // Warp-accumulate pixels of the same depth from the accumulator. val = WarpSumReduce(val); if (!(thread_idx & kAccumPixels - 1)) { - CudaAtomicAdd(filter_offset + filter, static_cast(val)); + GpuAtomicAdd(filter_offset + filter, static_cast(val)); } } } @@ -1745,7 +1754,7 @@ Status LaunchDepthwiseConv2dBackpropFilterGPU( } } -// A simple launch pad to launch the Cuda kernel for depthwise convolution. +// A simple launch pad to launch the Gpu kernel for depthwise convolution. template void LaunchDepthwiseConvBackpropFilterOp::operator()( OpKernelContext* ctx, const DepthwiseArgs& args, const T* out_backprop, @@ -1769,6 +1778,6 @@ void LaunchDepthwiseConvBackpropFilterOp::operator()( } } } // namespace tensorflow -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM #endif // TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_GPU_H_ diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu_double.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu_double.cu.cc index 073e7cf2698..1e4b3390d7f 100644 --- a/tensorflow/core/kernels/depthwise_conv_op_gpu_double.cu.cc +++ b/tensorflow/core/kernels/depthwise_conv_op_gpu_double.cu.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define EIGEN_USE_GPU #include "tensorflow/core/kernels/depthwise_conv_op.h" @@ -27,4 +27,4 @@ template struct LaunchDepthwiseConvBackpropInputOp; template struct LaunchDepthwiseConvBackpropFilterOp; } // namespace tensorflow -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu_float.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu_float.cu.cc index 4b0e15e4766..946cb650668 100644 --- a/tensorflow/core/kernels/depthwise_conv_op_gpu_float.cu.cc +++ b/tensorflow/core/kernels/depthwise_conv_op_gpu_float.cu.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define EIGEN_USE_GPU #include "tensorflow/core/kernels/depthwise_conv_op.h" @@ -27,4 +27,4 @@ template struct LaunchDepthwiseConvBackpropInputOp; template struct LaunchDepthwiseConvBackpropFilterOp; } // namespace tensorflow -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu_half.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu_half.cu.cc index 2db9fa4dff5..c1fe5dfa5b1 100644 --- a/tensorflow/core/kernels/depthwise_conv_op_gpu_half.cu.cc +++ b/tensorflow/core/kernels/depthwise_conv_op_gpu_half.cu.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define EIGEN_USE_GPU #include "tensorflow/core/kernels/depthwise_conv_op.h" @@ -27,4 +27,4 @@ template struct LaunchDepthwiseConvBackpropInputOp; template struct LaunchDepthwiseConvBackpropFilterOp; } // namespace tensorflow -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM From dca9fadebc0b2d54e084126566948a8fe993644d Mon Sep 17 00:00:00 2001 From: Bhavani Subramanian Date: Tue, 9 Jul 2019 10:42:10 -0700 Subject: [PATCH 0073/3053] Added support for common utility functions used by MKL-DNN enabled kernels for MKL-DNN v1.0. --- tensorflow/core/util/mkl_util.h | 703 +++++++++++++++++++++++++++++++- 1 file changed, 698 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index 1b62dad8878..bf13f9d8370 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -122,14 +122,83 @@ enum class MklQuantization { static const int kSmallBatchSize = 32; -// Forward decl +#ifdef ENABLE_MKLDNN_V1 +// In MKL-DNN v1.0, the format (ex. NCHW) used to initialize a memory descriptor +// (md) structure will no longer be recorded in its `format` field. Instead, it +// will be set to a canonical `blocked` format for every fully described md. +// +// Currently, we query this `format` field while mapping MKL-DNN's data format +// to TF's data format. Due to the above restriction, we will now get this data +// format information from TF's `data_format` attribute (i.e. via +// `TensorFormat`) for MKL-DNN v1.0. +// +// Since MKL-DNN operators such as ReLU do not have a `data_format` attribute +// (since they are in `blocked` format), we need to be able to distinguish +// between blocked and non-blocked formats. For this, we have defined a new +// enum called `MklTensorFormat` which is similar to `TensorFormat` but with +// an additional field called `FORMAT_UNDEF`, which could mean one of the +// following depending on the context: +// +// 1) Blocked format: as described above, this is needed for element-wise +// operators such as ReLU. +// 2) Invalid format: ex. unsupported format +// TODO(bhavanis): Do we need a separate field for invalid formats? 
+enum class MklTensorFormat { + FORMAT_NHWC = 0, + FORMAT_NCHW = 1, + FORMAT_NDHWC = 2, + FORMAT_NCDHW = 3, + FORMAT_UNDEF = 4, // either blocked or invalid +}; +#endif + +#ifdef ENABLE_MKLDNN_V1 +// Forward declarations +TensorFormat MklDnn3DDataFormatToTFDataFormat(MklTensorFormat format); +TensorFormat MklDnnDataFormatToTFDataFormat(MklTensorFormat format); +memory::format_tag MklTensorFormatToMklDnnDataFormat(MklTensorFormat format); +#else +// Forward declarations TensorFormat MklDnn3DDataFormatToTFDataFormat(memory::format format); TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format); +#endif memory::dims CalculateTFStrides(const memory::dims& dims_tf_order); memory::desc CreateBlockedMemDescHelper(const memory::dims& dim, const memory::dims& strides, memory::data_type dtype); +#ifdef ENABLE_MKLDNN_V1 +typedef std::unordered_map MemoryArgsMap; +inline std::ostream& operator<<(std::ostream& os, + const memory::format_tag& tag) { + if (tag == memory::format_tag::undef) { + os << "undef"; + } else if (tag == memory::format_tag::any) { + os << "any"; + } else { + os << "invalid"; + } + return os; +} + +inline std::ostream& operator<<(std::ostream& os, + const MklTensorFormat& format) { + if (format == MklTensorFormat::FORMAT_NHWC) { + os << "FORMAT_NHWC"; + } else if (format == MklTensorFormat::FORMAT_NCHW) { + os << "FORMAT_NCHW"; + } else if (format == MklTensorFormat::FORMAT_NDHWC) { + os << "FORMAT_NDHWC"; + } else if (format == MklTensorFormat::FORMAT_NCDHW) { + os << "FORMAT_NCDHW"; + } else if (format == MklTensorFormat::FORMAT_UNDEF) { + os << "FORMAT_UNDEF"; + } else { + os << "INVALID FORMAT"; + } +} +#endif + class MklDnnShape { private: typedef struct { @@ -139,8 +208,13 @@ class MklDnnShape { size_t dimension_ = 0; /// Required by MKLDNN for conversions mkldnn_dims_t sizes_; // Required by MKL for conversions +#ifdef ENABLE_MKLDNN_V1 + MklTensorFormat tf_data_format_ = MklTensorFormat::FORMAT_UNDEF; + memory::data_type T_ = memory::data_type::undef; +#else memory::format tf_data_format_ = memory::format::format_undef; memory::data_type T_ = memory::data_type::data_undef; +#endif // MKL layout mkldnn_memory_desc_t mkl_md_; /// TF dimension corresponding to this MKL dimension @@ -183,6 +257,27 @@ class MklDnnShape { return true; } +#ifdef ENABLE_MKLDNN_V1 + /// Equality function for MklDnnShape objects + /// @return true if both are equal; false otherwise. + inline bool operator==(const MklDnnShape& input_shape) const { + if (this->IsMklTensor() != input_shape.IsMklTensor()) { + return false; + } + + // If input tensors are in Mkl layout, then we check for dimensions and + // sizes. + if (this->IsMklTensor()) { + const mkldnn_memory_desc_t& cur_md = (this->GetMklLayout()).data; + const mkldnn_memory_desc_t& input_shape_md = + input_shape.GetMklLayout().data; + return this->GetTfShape() == input_shape.GetTfShape() && + mkldnn_memory_desc_equal(&cur_md, &input_shape_md); + } + + return true; + } +#else /// Equality function for MklDnnShape objects /// @return true if both are equal; false otherwise. inline bool operator==(const MklDnnShape& input_shape) const { @@ -200,6 +295,7 @@ class MklDnnShape { return true; } +#endif /// Equality operator for MklDnnShape and TFShape. 
/// Returns: true if TF shapes for both are the same, false otherwise @@ -299,7 +395,13 @@ class MklDnnShape { CHECK_EQ(data_.is_mkl_tensor_, true); std::vector shape(data_.dimension_, -1); +#ifdef ENABLE_MKLDNN_V1 + // As mentioned in the comment above, we now rely on TF's `data_format` + // attribute to determine if TF shape is in blocked format or not. + if (data_.tf_data_format_ != MklTensorFormat::FORMAT_UNDEF) { +#else if (data_.tf_data_format_ != memory::format::blocked) { +#endif for (size_t idx = 0; idx < data_.dimension_; ++idx) { shape[idx] = data_.sizes_[TfDimIdx(idx)]; } @@ -321,10 +423,13 @@ class MklDnnShape { inline void SetElemType(memory::data_type dt) { data_.T_ = dt; } inline const memory::data_type GetElemType() { return data_.T_; } +#ifndef ENABLE_MKLDNN_V1 + // Memory primitive descriptor is deprecated in MKL-DNN v1.0. inline void SetMklLayout(memory::primitive_desc* pd) { CHECK_NOTNULL(pd); data_.mkl_md_ = pd->desc().data; } +#endif inline void SetMklLayout(memory::desc* md) { CHECK_NOTNULL(md); @@ -335,9 +440,67 @@ class MklDnnShape { return memory::desc(data_.mkl_md_); } +#ifdef ENABLE_MKLDNN_V1 + inline MklTensorFormat GetTfDataFormat() const { + return data_.tf_data_format_; + } + + /// We don't create primitive_descriptor for TensorFlow layout now. + /// We use lazy evaluation and create it only when needed. Input format can + /// also be Blocked format. + inline void SetTfLayout(size_t dims, const memory::dims& sizes, + MklTensorFormat format) { + DCHECK_EQ(dims, sizes.size()) + << "SetTfLayout: Number of dimensions does not" + "match with dimension array"; + data_.dimension_ = dims; + for (size_t ii = 0; ii < dims; ++ii) { + data_.sizes_[ii] = sizes[ii]; + } + data_.tf_data_format_ = format; + if (format != MklTensorFormat::FORMAT_UNDEF) { + SetTfDimOrder(dims, format); + } + } + + inline void SetTfLayout2D(size_t dims, const memory::dims& sizes, + MklTensorFormat format) { + DCHECK_EQ(dims, sizes.size()) + << "SetTfLayout2D: Number of dimensions does not" + "match with dimension array"; + data_.dimension_ = dims; + for (size_t ii = 0; ii < dims; ++ii) { + data_.sizes_[ii] = sizes[ii]; + } + data_.tf_data_format_ = format; + if (format != MklTensorFormat::FORMAT_UNDEF) { + data_.map_[0] = MklDnnDims::Dim_N; + data_.map_[1] = MklDnnDims::Dim_C; + } + } + + inline const memory::desc GetTfLayout() const { + memory::dims dims; + for (size_t ii = 0; ii < data_.dimension_; ++ii) { + dims.push_back(data_.sizes_[ii]); + } + + // Create Blocked memory desc if input TF format was set like that. + if (data_.tf_data_format_ == MklTensorFormat::FORMAT_UNDEF) { + auto strides = CalculateTFStrides(dims); + return CreateBlockedMemDescHelper(dims, strides, data_.T_); + } else { + auto format_tag = + MklTensorFormatToMklDnnDataFormat(data_.tf_data_format_); + DCHECK_NE(format_tag, memory::format_tag::undef); + return memory::desc(dims, data_.T_, format_tag); + } + } +#else inline memory::format GetTfDataFormat() const { return data_.tf_data_format_; } + /// We don't create primitive_descriptor for TensorFlow layout now. /// We use lazy evaluation and create it only when needed. Input format can /// also be Blocked format. @@ -386,6 +549,7 @@ class MklDnnShape { return memory::desc(dims, data_.T_, data_.tf_data_format_); } } +#endif inline const memory::desc GetCurLayout() const { return IsMklTensor() ? 
GetMklLayout() : GetTfLayout(); @@ -424,10 +588,17 @@ class MklDnnShape { } } +#ifdef ENABLE_MKLDNN_V1 + inline void SetTfDimOrder(const size_t dimension, MklTensorFormat format) { + TensorFormat data_format = MklDnnDataFormatToTFDataFormat(format); + SetTfDimOrder(dimension, data_format); + } +#else inline void SetTfDimOrder(const size_t dimension, memory::format format) { TensorFormat data_format = MklDnnDataFormatToTFDataFormat(format); SetTfDimOrder(dimension, data_format); } +#endif inline const mkldnn_dim_t* GetTfToMklDimMap() const { return &data_.map_[0]; } inline size_t TfDimIdx(int index) const { return data_.map_[index]; } @@ -528,29 +699,52 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor, context->allocate_temp(DataTypeToEnum::v(), output_shape, &output_tensor); +#ifdef ENABLE_MKLDNN_V1 + engine cpu_engine(engine::kind::cpu, 0); + stream cpu_stream(cpu_engine); +#else auto cpu_engine = engine(engine::cpu, 0); +#endif MklDnnData input(&cpu_engine); // Get Mkl layout of input tensor. auto input_mkl_md = mkl_shape.GetMklLayout(); auto output_tf_md = mkl_shape.GetTfLayout(); +#ifndef ENABLE_MKLDNN_V1 + // Memory primitive descriptor is deprecated in MKL-DNN v1.0. auto output_tf_pd = memory::primitive_desc(output_tf_md, cpu_engine); +#endif input.SetUsrMem(input_mkl_md, &mkl_tensor); - // reorder +#ifdef ENABLE_MKLDNN_V1 + // Reorder + if (input.IsReorderNeeded(output_tf_md)) { + std::vector net; + std::vector net_args; + DCHECK_EQ(input.CheckReorderToOpMem(output_tf_md, &output_tensor, net, + net_args, &cpu_engine), + true); + DCHECK_EQ(net.size(), net_args.size()); + for (size_t i = 0; i < net.size(); ++i) { + net.at(i).execute(cpu_stream, net_args.at(i)); + } + cpu_stream.wait(); +#else + // Reorder if (input.IsReorderNeeded(output_tf_pd)) { std::vector net; CHECK_EQ(input.CheckReorderToOpMem(output_tf_pd, &output_tensor, &net), true); stream(stream::kind::eager).submit(net).wait(); +#endif } else { // If not, just forward input tensor to output tensor. CHECK(output_tensor.CopyFrom(mkl_tensor, output_shape)); } } catch (mkldnn::error& e) { - string error_msg = "Status: " + std::to_string(e.status) + - ", message: " + string(e.message) + ", in file " + - string(__FILE__) + ":" + std::to_string(__LINE__); + string error_msg = "Status: " + std::to_string(e.status) + ", message: " + + string(e.message) + ", in file " + string(__FILE__) + + ":" + std::to_string(__LINE__); LOG(FATAL) << "Operation received an exception: " << error_msg; } return output_tensor; @@ -646,6 +840,17 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n, // Allocates a temp tensor and returns the data buffer for temporary storage. 
template +#ifdef ENABLE_MKLDNN_V1 +inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out, + const memory::desc& md, void** buf_out) { + TensorShape tf_shape; + + tf_shape.AddDim(md.get_size() / sizeof(T) + 1); + OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum::v(), + tf_shape, tensor_out)); + *buf_out = static_cast(tensor_out->flat().data()); +} +#else inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out, const memory::primitive_desc& pd, void** buf_out) { TensorShape tf_shape; @@ -655,6 +860,7 @@ inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out, tf_shape, tensor_out)); *buf_out = static_cast(tensor_out->flat().data()); } +#endif template inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out, @@ -663,6 +869,24 @@ inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out, tf_shape, tensor_out)); } +#ifdef ENABLE_MKLDNN_V1 +inline void GetStridesFromSizes(MklTensorFormat data_format, size_t* strides, + const size_t* sizes) { + DCHECK_NE(data_format, MklTensorFormat::FORMAT_UNDEF); + // MKL requires strides in NCHW + if (data_format == MklTensorFormat::FORMAT_NHWC) { + strides[0] = sizes[2]; + strides[1] = sizes[0] * sizes[2]; + strides[2] = 1; + strides[3] = sizes[0] * sizes[1] * sizes[2]; + } else { + strides[0] = 1; + strides[1] = sizes[0]; + strides[2] = sizes[0] * sizes[1]; + strides[3] = sizes[0] * sizes[1] * sizes[2]; + } +} +#else inline void GetStridesFromSizes(TensorFormat data_format, size_t* strides, const size_t* sizes) { // MKL requires strides in NCHW @@ -678,6 +902,7 @@ inline void GetStridesFromSizes(TensorFormat data_format, size_t* strides, strides[3] = sizes[0] * sizes[1] * sizes[2]; } } +#endif inline void CopyMklTensorInToOut(OpKernelContext* context, int idx_in, int idx_out) { @@ -832,6 +1057,67 @@ memory::data_type MklDnnType() { return memory::data_type::f32; } +#ifdef ENABLE_MKLDNN_V1 +// Map MklTensorFormat to MKL-DNN format tag +// +// @input: MklTensorFormat i.e. TensorFlow data format +// @return: MKL-DNN's memory format tag corresponding to MklTensorFormat. +// Fails with an error if invalid data format. +inline memory::format_tag MklTensorFormatToMklDnnDataFormat( + MklTensorFormat format) { + DCHECK_NE(format, MklTensorFormat::FORMAT_UNDEF); + using tag = memory::format_tag; + if (format == MklTensorFormat::FORMAT_NHWC) return tag::nhwc; + if (format == MklTensorFormat::FORMAT_NCHW) return tag::nchw; + if (format == MklTensorFormat::FORMAT_NDHWC) return tag::ndhwc; + if (format == MklTensorFormat::FORMAT_NCDHW) return tag::ncdhw; + return tag::undef; +} +#endif + +#ifdef ENABLE_MKLDNN_V1 +/// Map TensorFlow data format into MKL-DNN 3D data format +/// @input: TensorFlow data format +/// @return: MKL-DNN 3D data format corresponding to TensorFlow data format; +/// Fails with an error if invalid data format. +inline MklTensorFormat TFDataFormatToMklDnn3DDataFormat(TensorFormat format) { + if (format == FORMAT_NHWC) return MklTensorFormat::FORMAT_NDHWC; + if (format == FORMAT_NCHW) return MklTensorFormat::FORMAT_NCDHW; + TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT, "Unsupported data format")); + return MklTensorFormat::FORMAT_UNDEF; // Invalid format +} + +/// Map TensorFlow data format into MKL-DNN data format +/// +/// @input: TensorFlow data format +/// @return: MKL-DNN data format corresponding to TensorFlow data format; +/// Fails with an error if invalid data format. 
+inline MklTensorFormat TFDataFormatToMklDnnDataFormat(TensorFormat format) { + if (format == FORMAT_NHWC) return MklTensorFormat::FORMAT_NHWC; + if (format == FORMAT_NCHW) return MklTensorFormat::FORMAT_NCHW; + TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT, "Unsupported data format")); + return MklTensorFormat::FORMAT_UNDEF; // Invalid format +} + +/// Map MKL-DNN data format into TensorFlow data format +/// +/// @input: MKL-DNN data format +/// @return: Tensorflow data format corresponding to MKL-DNN data format; +/// Fails with an error if invalid data format. +inline TensorFormat MklDnnDataFormatToTFDataFormat(MklTensorFormat format) { + if (format == MklTensorFormat::FORMAT_NHWC || + format == MklTensorFormat::FORMAT_NDHWC) + return FORMAT_NHWC; + else if (format == MklTensorFormat::FORMAT_NCHW || + format == MklTensorFormat::FORMAT_NCDHW) + return FORMAT_NCHW; + TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT, "Unsupported data format")); + + // Return to prevent compiler warnings, otherwise TF_CHECK_OK will ensure + // that we don't come here. + return FORMAT_NHWC; +} +#else /// Map TensorFlow's data format into MKL-DNN 3D data format /// @input: TensorFlow data format /// @return: memory::format corresponding to TensorFlow data format; @@ -875,6 +1161,7 @@ inline TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format) { // that we don't come here. return FORMAT_NHWC; } +#endif /// Map TensorShape object into memory::dims required by MKL-DNN /// @@ -905,7 +1192,11 @@ inline memory::dims TFShapeToMklDnnDimsInNCHW(const TensorShape& shape, TensorFormat format) { // Check validity of format. CHECK_NE(TFDataFormatToMklDnnDataFormat(format), +#ifdef ENABLE_MKLDNN_V1 + MklTensorFormat::FORMAT_UNDEF); +#else memory::format::format_undef); +#endif int n = shape.dim_size(GetTensorDimIndex(format, 'N')); int c = shape.dim_size(GetTensorDimIndex(format, 'C')); @@ -920,7 +1211,11 @@ inline memory::dims TFShapeToMklDnnDimsInNCDHW(const TensorShape& shape, TensorFormat format) { // Validate format. CHECK_NE(TFDataFormatToMklDnn3DDataFormat(format), +#ifdef ENABLE_MKLDNN_V1 + MklTensorFormat::FORMAT_UNDEF); +#else memory::format::format_undef); +#endif int n = shape.dim_size(GetTensorDimIndex<3>(format, 'N')); int c = shape.dim_size(GetTensorDimIndex<3>(format, 'C')); @@ -938,7 +1233,11 @@ inline memory::dims MklDnnDimsInNCHW(const memory::dims& in_dims, TensorFormat format) { // Validate format. CHECK_NE(TFDataFormatToMklDnnDataFormat(format), +#ifdef ENABLE_MKLDNN_V1 + MklTensorFormat::FORMAT_UNDEF); +#else memory::format::format_undef); +#endif int n = in_dims[GetTensorDimIndex(format, 'N')]; int c = in_dims[GetTensorDimIndex(format, 'C')]; @@ -991,6 +1290,33 @@ inline padding_kind TFPaddingToMklDnnPadding(Padding pad) { return padding_kind::zero; } +#ifdef ENABLE_MKLDNN_V1 +/// Helper function to create memory descriptor in Blocked format +/// +/// @input: Tensor dimensions +/// @input: strides corresponding to dimensions. One can use utility +/// function such as CalculateTFStrides to compute strides +/// for given dimensions. +/// @return: memory::desc object corresponding to blocked memory format +/// for given dimensions and strides. 
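A short usage sketch of the mapping helpers introduced above (illustrative only; it assumes an ENABLE_MKLDNN_V1 build and uses only names declared in this header):

#include "mkldnn.hpp"
#include "tensorflow/core/util/mkl_util.h"

namespace tensorflow {

// Round-trips a TensorFlow data format through the helpers above:
// FORMAT_NHWC maps to MklTensorFormat::FORMAT_NHWC, which maps to
// memory::format_tag::nhwc; the reverse mapping collapses the 4D and 5D
// variants back to FORMAT_NHWC / FORMAT_NCHW.
inline void FormatMappingExample() {
  MklTensorFormat mkl_fmt = TFDataFormatToMklDnnDataFormat(FORMAT_NHWC);
  mkldnn::memory::format_tag tag = MklTensorFormatToMklDnnDataFormat(mkl_fmt);
  DCHECK(tag == mkldnn::memory::format_tag::nhwc);
  DCHECK(MklDnnDataFormatToTFDataFormat(mkl_fmt) == FORMAT_NHWC);
}

}  // namespace tensorflow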
+inline memory::desc CreateBlockedMemDescHelper(const memory::dims& dim, + const memory::dims& strides, + memory::data_type dtype) { + DCHECK_EQ(dim.size(), strides.size()); + mkldnn_dim_t input_dims[dim.size()]; + mkldnn_dim_t input_strides[dim.size()]; + for (size_t i = 0; i < dim.size(); ++i) { + input_dims[i] = dim[i]; + input_strides[i] = strides[i]; + } + mkldnn_memory_desc_t md; + DCHECK(mkldnn_memory_desc_init_by_strides(&md, dim.size(), input_dims, + memory::convert_to_c(dtype), + input_strides) == 0) + << "Failed to create blocked memory descriptor"; + return memory::desc(md); +} +#else /// Helper function to create memory descriptor in Blocked format /// /// @input: Tensor dimensions @@ -1026,6 +1352,7 @@ inline memory::desc CreateBlockedMemDescHelper(const memory::dims& dim, return memory::desc(md); } +#endif template inline primitive FindOrCreateReorder(const memory* from, const memory* to); @@ -1077,6 +1404,21 @@ class MklDnnData { void SetIs3DData(bool bIs3D_) { bIs3D = bIs3D_; } bool GetIs3D() { return bIs3D; } +#ifdef ENABLE_MKLDNN_V1 + /// Set user memory primitive using specified dimensions, memory format tag + /// and data_buffer. Function automatically uses element data type by using + /// input type T used for creating call object. + /// + /// In a nutshell, function allows user to describe the input tensor to + /// an operation. E.g., filter of Conv2D is of shape {1, 2, 3, 4}, and + /// memory format tag HWIO, and the buffer that contains actual values is + /// pointed by data_buffer. + inline void SetUsrMem(const memory::dims& dim, memory::format_tag fm, + void* data_buffer = nullptr) { + auto md = memory::desc(dim, MklDnnType(), fm); + SetUsrMem(md, data_buffer); + } +#else /// Set user memory primitive using specified dimensions, memory format and /// data_buffer. Function automatically uses element data type by using /// input type T used for creating call object. @@ -1090,12 +1432,21 @@ class MklDnnData { auto md = memory::desc(dim, MklDnnType(), fm); SetUsrMem(md, data_buffer); } +#endif +#ifdef ENABLE_MKLDNN_V1 + inline void SetUsrMem(const memory::dims& dim, memory::format_tag fm, + const Tensor* tensor) { + CHECK_NOTNULL(tensor); + SetUsrMem(dim, fm, GetTensorBuffer(tensor)); + } +#else inline void SetUsrMem(const memory::dims& dim, memory::format fm, const Tensor* tensor) { CHECK_NOTNULL(tensor); SetUsrMem(dim, fm, GetTensorBuffer(tensor)); } +#endif /// Helper function to create memory descriptor in Blocked format /// @@ -1129,6 +1480,8 @@ class MklDnnData { SetUsrMem(dim, strides, GetTensorBuffer(tensor)); } +#ifndef ENABLE_MKLDNN_V1 + /// Memory primitive descriptor is deprecated in MKL-DNN v1.0. /// A version of function to set user memory primitive that accepts memory /// descriptor directly, instead of accepting dimensions and format. This /// function is more generic that the one above, but the function above is @@ -1137,6 +1490,7 @@ class MklDnnData { auto pd = memory::primitive_desc(md, *cpu_engine_); SetUsrMem(pd, data_buffer); } +#endif /// A version of SetUsrMem with memory descriptor and tensor inline void SetUsrMem(const memory::desc& md, const Tensor* tensor) { @@ -1144,6 +1498,22 @@ class MklDnnData { SetUsrMem(md, GetTensorBuffer(tensor)); } +#ifdef ENABLE_MKLDNN_V1 + /// A version of function to set user memory type that accepts memory + /// descriptor directly, instead of accepting dimensions and format. This + /// function is more generic than the one above, but the function above is + /// sufficient in most cases. 
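A hedged usage sketch of the v1.0 helper above (illustrative only; CalculateTFStrides is the existing stride utility referenced in the comment, and the dimensions are invented):

#include "mkldnn.hpp"
#include "tensorflow/core/util/mkl_util.h"

namespace tensorflow {

// Builds a plain row-major (NCHW-contiguous) descriptor through strides only.
// This is equivalent to memory::desc(dims, f32, format_tag::nchw), but goes
// through the blocked-format path that MKL tensors use to carry their layout.
inline mkldnn::memory::desc BlockedDescExample() {
  using mkldnn::memory;
  memory::dims dims = {2, 16, 8, 8};                // N, C, H, W (example)
  memory::dims strides = CalculateTFStrides(dims);  // {1024, 64, 8, 1}
  return CreateBlockedMemDescHelper(dims, strides, memory::data_type::f32);
}

}  // namespace tensorflow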
+ inline void SetUsrMem(const memory::desc& md, void* data_buffer = nullptr) { + CHECK_NOTNULL(cpu_engine_); + if (user_memory_) delete user_memory_; + // TODO(nhasabni): can we remove dynamic memory allocation? + if (data_buffer) { + user_memory_ = new memory(md, *cpu_engine_, data_buffer); + } else { + user_memory_ = new memory(md, *cpu_engine_); + } + } +#else /// A version of function to set user memory primitive that accepts primitive /// descriptor directly, instead of accepting dimensions and format. This /// function is more generic that the one above, but the function above is @@ -1159,29 +1529,44 @@ class MklDnnData { user_memory_ = new memory(pd); } } +#endif +#ifndef ENABLE_MKLDNN_V1 + /// Memory primitive descriptor is deprecated in MKL-DNN v1.x /// A version of SetUsrMem with primitive descriptor and tensor inline void SetUsrMem(const memory::primitive_desc& pd, const Tensor* tensor) { CHECK_NOTNULL(tensor); SetUsrMem(pd, GetTensorBuffer(tensor)); } +#endif /// Get function for user memory primitive. inline const memory* GetUsrMem() const { return user_memory_; } +#ifndef ENABLE_MKLDNN_V1 + /// Memory primitive descriptor is deprecated in MKL-DNN v1.0. /// Get function for primitive descriptor of user memory primitive. inline const memory::primitive_desc GetUsrMemPrimDesc() const { CHECK_NOTNULL(user_memory_); return user_memory_->get_primitive_desc(); } +#endif +#ifdef ENABLE_MKLDNN_V1 + /// Get function for descriptor of user memory. + inline memory::desc GetUsrMemDesc() const { + CHECK_NOTNULL(user_memory_); + return user_memory_->get_desc(); + } +#else /// Get function for descriptor of user memory. inline memory::desc GetUsrMemDesc() { // This is ugly. Why MKL-DNN does not provide desc() method of const type?? const memory::primitive_desc pd = GetUsrMemPrimDesc(); return const_cast(&pd)->desc(); } +#endif /// Get function for data buffer of user memory primitive. inline void* GetUsrMemDataHandle() const { @@ -1223,6 +1608,16 @@ class MklDnnData { return reorder_memory_ ? *reorder_memory_ : *user_memory_; } +#ifdef ENABLE_MKLDNN_V1 + /// Set memory descriptor of an operation in terms of dimensions and memory + /// format. E.g., For Conv2D, the dimensions would be same as user dimensions + /// but memory::format_tag would be mkldnn::any because we want MKL-DNN to + /// choose the best layout/format for given input dimensions. + inline void SetOpMemDesc(const memory::dims& dim, memory::format_tag fm) { + // TODO(nhasabni): can we remove dynamic memory allocation? + op_md_ = new memory::desc(dim, MklDnnType(), fm); + } +#else /// Set memory descriptor of an operation in terms of dimensions and memory /// format. E.g., For Conv2D, the dimensions would be same as user dimensions /// but memory::format would be mkldnn::any because we want MKL-DNN to choose @@ -1231,10 +1626,22 @@ class MklDnnData { // TODO(nhasabni): can we remove dynamic memory allocation? op_md_ = new memory::desc(dim, MklDnnType(), fm); } +#endif /// Get function for memory descriptor for an operation inline const memory::desc& GetOpMemDesc() const { return *op_md_; } +#ifdef ENABLE_MKLDNN_V1 + /// Predicate that checks if we need to reorder user's memory into memory + /// pointed by op_md. + /// + /// @input: op_md - memory descriptor of the given input of an operation. + /// @return: true in case reorder of input is needed; false, otherwise. 
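A small sketch of how an op is expected to describe its input with MklDnnData under ENABLE_MKLDNN_V1 (illustrative only; the buffer and shape are invented, and only the SetUsrMem/SetOpMemDesc overloads from this class are used):

#include "mkldnn.hpp"
#include "tensorflow/core/util/mkl_util.h"

namespace tensorflow {

// The user memory is the layout the incoming TF tensor really has, while the
// op descriptor uses format_tag::any so MKL-DNN can pick its preferred layout.
inline void DescribeConvInputExample(void* input_buffer) {
  using mkldnn::memory;
  mkldnn::engine cpu_engine(mkldnn::engine::kind::cpu, 0);
  MklDnnData<float> src(&cpu_engine);

  memory::dims src_dims = {1, 3, 224, 224};  // NCHW order, example values
  src.SetUsrMem(src_dims, memory::format_tag::nhwc, input_buffer);
  src.SetOpMemDesc(src_dims, memory::format_tag::any);
}

}  // namespace tensorflow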
+  inline bool IsReorderNeeded(const memory::desc& op_md) const {
+    CHECK_NOTNULL(user_memory_);
+    return op_md != user_memory_->get_desc();
+  }
+#else
   /// Predicate that checks if we need to reorder user's memory into memory
   /// pointed by op_pd.
   ///
@@ -1245,7 +1652,13 @@ class MklDnnData {
     CHECK_NOTNULL(user_memory_);
     return op_pd != user_memory_->get_primitive_desc();
   }
+#endif
+#ifndef ENABLE_MKLDNN_V1
+  /// In MKL-DNN v1.0, it is not possible to directly compare two memory
+  /// format tags since they only provide a partial description of the memory
+  /// layout. Hence, this function is disabled for MKL-DNN v1.0.
+  ///
   /// Predicate that checks if we need to reorder user's memory into memory
   /// based on the provided format.
   ///
@@ -1257,6 +1670,7 @@ class MklDnnData {
     return target_format !=
            user_memory_->get_primitive_desc().desc().data.format;
   }
+#endif
   /// Function to create a reorder from memory pointed by from to memory pointed
   /// by to. Returns created primitive.
@@ -1266,6 +1680,40 @@ class MklDnnData {
     return reorder(*from, *to);
   }
+#ifdef ENABLE_MKLDNN_V1
+  /// Function to handle input reordering
+  ///
+  /// Check if we need to reorder this input of an operation.
+  /// Return true and allocate reorder memory primitive if reorder is needed.
+  /// Otherwise, return false and do not allocate reorder memory primitive.
+  ///
+  /// To check if reorder is needed, this function compares memory descriptor
+  /// of an operation (op_md) for the given input with the
+  /// user-specified memory descriptor.
+  ///
+  /// @input: op_md - memory descriptor of the given input of an operation
+  /// @input: net - net to which to add reorder primitive in case it is needed.
+  /// @input: net_args - net to which user and reorder memories are added if
+  ///         needed. Each entry is a key-value pair of the form
+  ///         .
+  /// @return: true in case reorder of input is needed; false, otherwise.
+  inline bool CheckReorderToOpMem(const memory::desc& op_md,
+                                  std::vector& net,
+                                  std::vector& net_args,
+                                  const engine& engine) {
+    CHECK_NOTNULL(user_memory_);
+    DCHECK_EQ(net.size(), net_args.size());
+    if (IsReorderNeeded(op_md)) {
+      // TODO(nhasabni): can we remove dynamic memory allocation?
+      reorder_memory_ = new memory(op_md, engine);
+      net.push_back(CreateReorder(user_memory_, reorder_memory_));
+      net_args.push_back(MemoryArgsMap{{MKLDNN_ARG_FROM, *user_memory_},
+                                       {MKLDNN_ARG_TO, *reorder_memory_}});
+      return true;
+    }
+    return false;
+  }
+#else
   /// Function to handle input reordering
   ///
   /// Check if we need to reorder this input of an operation.
   ///
@@ -1292,7 +1740,29 @@ class MklDnnData {
     }
     return false;
   }
+#endif
+#ifdef ENABLE_MKLDNN_V1
+  /// TODO(bhavanis): Need to use reorder cache here for better performance.
+  /// TODO: this is a faster path with reorder primitive cache compared with
+  /// CheckReorderToOpMem(..., std::vector* net).
+  /// TODO(gzmkl): Remove the slower path.
+  inline bool CheckReorderToOpMem(const memory::desc& op_md,
+                                  const engine& engine) {
+    CHECK_NOTNULL(user_memory_);
+    if (IsReorderNeeded(op_md)) {
+      // TODO(nhasabni): can we remove dynamic memory allocation?
+ // primitive reuse don't allow two same reorder prim in + // one stream, so submit it immediately + reorder_memory_ = new memory(op_md, engine); + stream cpu_stream(engine); + reorder(*user_memory_, *reorder_memory_) + .execute(cpu_stream, *user_memory_, *reorder_memory_); + return true; + } + return false; + } +#else /// This is a faster path with reorder primitive cache compared with /// CheckReorderToOpMem(..., std::vector* net). /// TODO(gzmkl): Remove the slower path. @@ -1310,7 +1780,40 @@ class MklDnnData { } return false; } +#endif +#ifdef ENABLE_MKLDNN_V1 + /// Overloaded version of above function that accepts memory buffer + /// where output of reorder needs to be stored. + /// + /// @input: op_md - memory descriptor of the given input of an operation + /// @reorder_data_handle - memory buffer where output of reorder needs to be + /// stored. Primitive does not check if buffer has + /// enough size to write. + /// @input: net - net to which to add reorder primitive in case it is needed. + /// @input: net_args - net to which user and reorder memories are added if + /// needed. Each entry is a key-value pair of the form + /// . + /// @input: engine - MKL-DNN's abstraction of a computational device + /// @return: true in case reorder of input is needed; false, otherwise. + inline bool CheckReorderToOpMem(const memory::desc& op_md, + void* reorder_data_handle, + std::vector& net, + std::vector& net_args, + const engine& engine) { + CHECK_NOTNULL(reorder_data_handle); + CHECK_NOTNULL(user_memory_); + if (IsReorderNeeded(op_md)) { + // TODO(nhasabni): can we remove dynamic memory allocation? + reorder_memory_ = new memory(op_md, engine, reorder_data_handle); + net.push_back(CreateReorder(user_memory_, reorder_memory_)); + net_args.push_back(MemoryArgsMap{{MKLDNN_ARG_FROM, *user_memory_}, + {MKLDNN_ARG_TO, *reorder_memory_}}); + return true; + } + return false; + } +#else /// Overloaded version of above function that accepts memory buffer /// where output of reorder needs to be stored. /// @@ -1335,7 +1838,28 @@ class MklDnnData { } return false; } +#endif +#ifdef ENABLE_MKLDNN_V1 + /// TODO(bhavanis): Need to use reorder cache here for better performance. + inline bool CheckReorderToOpMem(const memory::desc& op_md, + void* reorder_data_handle, + const engine& engine) { + CHECK_NOTNULL(reorder_data_handle); + CHECK_NOTNULL(user_memory_); + if (IsReorderNeeded(op_md)) { + // TODO(nhasabni): can we remove dynamic memory allocation? + // primitive reuse don't allow two same reorder prim in + // one stream, so submit it immediately + reorder_memory_ = new memory(op_md, engine, reorder_data_handle); + stream cpu_stream(engine); + reorder(*user_memory_, *reorder_memory_) + .execute(cpu_stream, *user_memory_, *reorder_memory_); + return true; + } + return false; + } +#else /// This is a faster path with reorder primitive cache compared with /// CheckReorderToOpMem(..., std::vector* net). /// The slower path will be removed in the future @@ -1355,7 +1879,32 @@ class MklDnnData { } return false; } +#endif +#ifdef ENABLE_MKLDNN_V1 + /// Another overloaded version of CheckReorderToOpMem that accepts Tensor + /// where output of reorder needs to be stored. + /// + /// @input: op_md - memory descriptor of the given input of an operation + /// @reorder_tensor - Tensor whose buffer is to be used to store output of + /// reorder. Primitive does not check if buffer is + /// enough size to write. + /// @input: net - net to which to add reorder primitive in case it is needed. 
+ /// @input: net_args - net to which user and reorder memories are added if + /// needed. Each entry is a key-value pair of the form + /// . + /// @input: engine - MKL-DNN's abstraction of a computational device + /// @return: true in case reorder of input is needed; false, otherwise. + inline bool CheckReorderToOpMem(const memory::desc& op_md, + Tensor* reorder_tensor, + std::vector& net, + std::vector& net_args, + const engine& engine) { + CHECK_NOTNULL(reorder_tensor); + return CheckReorderToOpMem(op_md, GetTensorBuffer(reorder_tensor), net, + net_args, engine); + } +#else /// Another overloaded version of CheckReorderToOpMem that accepts Tensor /// where output of reorder needs to be stored. /// @@ -1373,7 +1922,20 @@ class MklDnnData { CHECK_NOTNULL(reorder_tensor); return CheckReorderToOpMem(op_pd, GetTensorBuffer(reorder_tensor), net); } +#endif +#ifdef ENABLE_MKLDNN_V1 + /// TODO: this is a faster path with reorder primitive cache compared with + /// CheckReorderToOpMem(op_md, reorder_tensor, net, net_args, engine), will + /// remove + /// slow path in the future + inline bool CheckReorderToOpMem(const memory::desc& op_md, + Tensor* reorder_tensor) { + CHECK_NOTNULL(reorder_tensor); + return CheckReorderToOpMem(op_md, GetTensorBuffer(reorder_tensor), + *cpu_engine_); + } +#else /// TODO: this is a faster path with reorder primitive cache compared with /// CheckReorderToOpMem(..., std::vector* net), will remove /// slow path in the future @@ -1382,7 +1944,31 @@ class MklDnnData { CHECK_NOTNULL(reorder_tensor); return CheckReorderToOpMem(op_pd, GetTensorBuffer(reorder_tensor)); } +#endif +#ifdef ENABLE_MKLDNN_V1 + /// Function to handle output reorder + /// + /// This function performs very similar functionality as input reordering + /// function above. The only difference is that this function does not add + /// reorder primitive to the net. The reason for this is: the reorder + /// primitive for output needs to be added to the list only after operation + /// has executed. But we need to prepare a temporary buffer in case output + /// reorder is needed. And this temporary buffer will hold the output of + /// an operation before it is fed to reorder primitive. + /// + /// @input memory descriptor for the given output of an operation + /// @return: true in case reorder of output is needed; false, otherwise. + inline bool PrepareReorderToUserMemIfReq(const memory::desc& op_md) { + CHECK_NOTNULL(user_memory_); + if (IsReorderNeeded(op_md)) { + // TODO(nhasabni): can we remove dynamic memory allocation? + reorder_memory_ = new memory(op_md, *cpu_engine_); + return true; + } + return false; + } +#else /// Function to handle output reorder /// /// This function performs very similar functionality as input reordering @@ -1405,7 +1991,28 @@ class MklDnnData { } return false; } +#endif +#ifdef ENABLE_MKLDNN_V1 + /// Function to actually insert reorder primitive in the net + /// + /// This function completes remaining part of output reordering. It inserts + /// a reordering primitive from the temporary buffer that holds the output + /// to the user-specified output buffer. + /// + /// @input: net - net to which to add reorder primitive + /// @input: net_args - net to which user and reorder memories are added if + /// needed. Each entry is a key-value pair of the form + /// . 
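Putting the pieces above together, a sketch of the intended v1.0 reorder flow for one input and one output (illustrative only; op_src_md/op_dst_md stand for the descriptors an operation's primitive_desc would provide, MemoryArgsMap is the alias used in the hunks above, and the op's own primitive is elided):

#include <vector>

#include "mkldnn.hpp"
#include "tensorflow/core/util/mkl_util.h"

namespace tensorflow {

// One input, one output: reorder in if needed, run the op, reorder out if
// needed, then execute everything on a single stream.
template <typename T>
void RunWithReordersExample(MklDnnData<T>* src, MklDnnData<T>* dst,
                            const mkldnn::memory::desc& op_src_md,
                            const mkldnn::memory::desc& op_dst_md,
                            const mkldnn::engine& cpu_engine) {
  std::vector<mkldnn::primitive> net;
  std::vector<MemoryArgsMap> net_args;

  // Input side: appends a reorder to net/net_args only if layouts differ.
  src->CheckReorderToOpMem(op_src_md, net, net_args, cpu_engine);

  // ... the operation's own primitive and argument map would be appended here ...

  // Output side: allocate a temporary holding the op's output layout, and
  // append the reorder back to the user layout after the op.
  if (dst->PrepareReorderToUserMemIfReq(op_dst_md)) {
    dst->InsertReorderToUserMem(net, net_args);
  }

  mkldnn::stream cpu_stream(cpu_engine);
  for (size_t i = 0; i < net.size(); ++i) {
    net.at(i).execute(cpu_stream, net_args.at(i));
  }
  cpu_stream.wait();
}

}  // namespace tensorflow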
+ inline void InsertReorderToUserMem(std::vector& net, + std::vector& net_args) { + CHECK_NOTNULL(user_memory_); + CHECK_NOTNULL(reorder_memory_); + net.push_back(CreateReorder(reorder_memory_, user_memory_)); + net_args.push_back(MemoryArgsMap{{MKLDNN_ARG_FROM, *reorder_memory_}, + {MKLDNN_ARG_TO, *user_memory_}}); + } +#else /// Function to actually insert reorder primitive in the net /// /// This function completes remaining part of output reordering. It inserts @@ -1419,7 +2026,31 @@ class MklDnnData { CHECK_NOTNULL(reorder_memory_); net->push_back(CreateReorder(reorder_memory_, user_memory_)); } +#endif +#ifdef ENABLE_MKLDNN_V1 + /// TODO: this is a faster path with reorder primitive cache compared with + /// InsertReorderToUserMem(net, net_args), will remove + /// slow path in the future + inline void InsertReorderToUserMem() { + CHECK_NOTNULL(user_memory_); + CHECK_NOTNULL(reorder_memory_); + CHECK_NOTNULL(cpu_engine_); + stream cpu_stream(cpu_engine_); + // primitive reuse don't allow two same reorder prim in + // one stream, so submit it immediately + std::vector net; + std::vector net_args; + net.push_back(FindOrCreateReorder(reorder_memory_, user_memory_)); + net_args.push_back(MemoryArgsMap{{MKLDNN_ARG_FROM, *reorder_memory_}, + {MKLDNN_ARG_TO, *user_memory_}}); + DCHECK_EQ(net.size(), net_args.size()); + for (size_t i = 0; i < net.size(); ++i) { + net.at(i).execute(cpu_stream, net_args.at(i)); + } + cpu_stream.wait(); + } +#else /// TODO: this is a faster path with reorder primitive cache compared with /// InsertReorderToUserMem(std::vector* net), will remove /// slow path in the future @@ -1432,6 +2063,7 @@ class MklDnnData { net.push_back(FindOrCreateReorder(reorder_memory_, user_memory_)); stream(stream::kind::eager).submit(net).wait(); } +#endif }; /// Base class for operations with reuse of primitives @@ -1624,6 +2256,25 @@ class FactoryKeyCreator { } }; +#ifdef ENABLE_MKLDNN_V1 +static inline memory::format_tag get_desired_format(int channel, + bool is_2d = true) { + memory::format_tag fmt_desired = memory::format_tag::any; + + if (port::TestCPUFeature(port::CPUFeature::AVX512F)) { + fmt_desired = + is_2d ? memory::format_tag::nChw16c : memory::format_tag::nCdhw16c; + } else if (port::TestCPUFeature(port::CPUFeature::AVX2) && + (channel % 8) == 0) { + fmt_desired = + is_2d ? memory::format_tag::nChw8c + : memory::format_tag::ncdhw; // no avx2 support for 3d yet. + } else { + fmt_desired = is_2d ? 
memory::format_tag::nchw : memory::format_tag::ncdhw; + } + return fmt_desired; +} +#else static inline memory::format get_desired_format(int channel, bool is_2d = true) { memory::format fmt_desired = memory::format::any; @@ -1639,6 +2290,7 @@ static inline memory::format get_desired_format(int channel, } return fmt_desired; } +#endif class MklReorderPrimitive : public MklPrimitive { public: @@ -1663,8 +2315,21 @@ class MklReorderPrimitive : public MklPrimitive { : src_mem(nullptr), dst_mem(nullptr), reorder_prim(nullptr) {} } context_; +#ifdef ENABLE_MKLDNN_V1 + engine cpu_engine_ = engine(engine::kind::cpu, 0); +#else engine cpu_engine_ = engine(engine::cpu, 0); +#endif +#ifdef ENABLE_MKLDNN_V1 + void Setup(const memory* from, const memory* to) { + context_.src_mem.reset( + new memory(from->get_desc(), cpu_engine_, DummyData)); + context_.dst_mem.reset(new memory(to->get_desc(), cpu_engine_, DummyData)); + context_.reorder_prim = std::make_shared( + reorder(*context_.src_mem, *context_.dst_mem)); + } +#else void Setup(const memory* from, const memory* to) { context_.src_mem.reset(new memory( {from->get_primitive_desc().desc(), cpu_engine_}, DummyData)); @@ -1673,6 +2338,7 @@ class MklReorderPrimitive : public MklPrimitive { context_.reorder_prim = std::make_shared( reorder(*context_.src_mem, *context_.dst_mem)); } +#endif }; template @@ -1699,6 +2365,32 @@ class MklReorderPrimitiveFactory : public MklPrimitiveFactory { MklReorderPrimitiveFactory() {} ~MklReorderPrimitiveFactory() {} +#ifdef ENABLE_MKLDNN_V1 + static string CreateKey(const memory* from, const memory* to) { + string prefix = "reorder"; + FactoryKeyCreator key_creator; + auto const& from_desc = from->get_desc().data; + auto const& to_desc = to->get_desc().data; + const int KIdxFirstStride = 0; + memory::dims from_dims(from_desc.dims, &from_desc.dims[from_desc.ndims]); + memory::dims to_dims(to_desc.dims, &to_desc.dims[to_desc.ndims]); + memory::dims from_strides( + from_desc.format_desc.blocking.strides, + &from_desc.format_desc.blocking.strides[from_desc.ndims]); + memory::dims to_strides( + to_desc.format_desc.blocking.strides, + &to_desc.format_desc.blocking.strides[to_desc.ndims]); + key_creator.AddAsKey(prefix); + // `format_kind` is not added since it will always set to `mkldnn_blocked` + key_creator.AddAsKey(static_cast(from_desc.data_type)); + key_creator.AddAsKey(from_dims); + key_creator.AddAsKey(from_strides); + key_creator.AddAsKey(static_cast(to_desc.data_type)); + key_creator.AddAsKey(to_dims); + key_creator.AddAsKey(to_strides); + return key_creator.GetKey(); + } +#else static string CreateKey(const memory* from, const memory* to) { string prefix = "reorder"; FactoryKeyCreator key_creator; @@ -1725,6 +2417,7 @@ class MklReorderPrimitiveFactory : public MklPrimitiveFactory { key_creator.AddAsKey(to_strides); return key_creator.GetKey(); } +#endif MklPrimitive* GetReorder(const memory* from, const memory* to) { string key = CreateKey(from, to); From e7c6533b7d3f1997bfabe9043210845f016ab688 Mon Sep 17 00:00:00 2001 From: amoitra Date: Tue, 9 Jul 2019 14:40:29 -0700 Subject: [PATCH 0074/3053] Incorporate Thomas's comments --- .../compiler/xla/service/gpu/cudnn_conv_rewriter.cc | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc index 21ef810e64b..ca8d63cbcc7 100755 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc +++ 
b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc @@ -251,16 +251,17 @@ MatchBackwardFilter(HloInstruction* conv) { return std::make_tuple(true, backward_conv_window, backward_conv_dnums, lhs); } - Shape new_shape = lhs->shape(); int64 input_batch_dimension = backward_conv_dnums.input_batch_dimension(); int64 input_feature_dimension = backward_conv_dnums.input_feature_dimension(); - int64 input_batch = new_shape.dimensions(input_batch_dimension); - int64 input_feature = new_shape.dimensions(input_feature_dimension); - + int64 input_batch = lhs->shape().dimensions(input_batch_dimension); // Ensure that input_batch is exact multiple of conv->feature_group_count() - CHECK_EQ(input_batch % conv->feature_group_count(), 0); + CHECK_EQ(input_batch % conv->feature_group_count(), 0) + << "Input batch should be an exact multiple of feature group count"; + int64 input_feature = lhs->shape().dimensions(input_feature_dimension); + + Shape new_shape = lhs->shape(); new_shape.set_dimensions(input_batch_dimension, input_batch / conv->feature_group_count()); new_shape.set_dimensions(input_feature_dimension, From 7dec5009480a2d04962d09a2e62e1253952a9745 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 9 Jul 2019 22:40:44 +0000 Subject: [PATCH 0075/3053] Update to address review comments Signed-off-by: Yong Tang --- tensorflow/python/keras/engine/training_arrays.py | 5 +++-- .../python/keras/engine/training_arrays_test.py | 14 ++++++-------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/tensorflow/python/keras/engine/training_arrays.py b/tensorflow/python/keras/engine/training_arrays.py index 206c8aefdb2..cca8f1bd157 100644 --- a/tensorflow/python/keras/engine/training_arrays.py +++ b/tensorflow/python/keras/engine/training_arrays.py @@ -35,6 +35,7 @@ from tensorflow.python.keras.utils.generic_utils import make_batches from tensorflow.python.keras.utils.generic_utils import slice_arrays from tensorflow.python.keras.utils.mode_keys import ModeKeys from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.util import nest try: from scipy.sparse import issparse # pylint: disable=g-import-not-at-top @@ -207,8 +208,8 @@ def model_iteration(model, val_samples_or_steps = validation_steps else: # Get num samples for printing. 
- vals = val_inputs.values() if isinstance(val_inputs, dict) else val_inputs - val_samples_or_steps = vals and vals[0].shape[0] or None + val_samples_or_steps = val_inputs and nest.flatten( + val_inputs)[0].shape[0] or None if mode == ModeKeys.TRAIN and verbose: _print_train_info(num_samples_or_steps, val_samples_or_steps, is_dataset) diff --git a/tensorflow/python/keras/engine/training_arrays_test.py b/tensorflow/python/keras/engine/training_arrays_test.py index 943fc0d343e..0d145b9c947 100644 --- a/tensorflow/python/keras/engine/training_arrays_test.py +++ b/tensorflow/python/keras/engine/training_arrays_test.py @@ -110,7 +110,7 @@ class PrintTrainingInfoTest(parameterized.TestCase): if do_validation: self.assertIn(", validate on 50 samples", mock_stdout.getvalue()) - def test_dict_input(self): + def test_dict_validation_input(self): """Test case for GitHub issue 30122.""" train_input_0 = np.random.rand(1000, 1) train_input_1 = np.random.rand(1000, 1) @@ -139,13 +139,11 @@ class PrintTrainingInfoTest(parameterized.TestCase): model = my_model() model.compile(loss="mae", optimizer="adam") - mock_stdout = six.StringIO() - with test.mock.patch.object(sys, "stdout", mock_stdout): - model.fit( - x={'input_0': train_input_0, 'input_1': train_input_1}, - y=train_labels, - validation_data=( - {'input_0': val_input_0, 'input_1': val_input_1}, val_labels)) + model.fit( + x={'input_0': train_input_0, 'input_1': train_input_1}, + y=train_labels, + validation_data=( + {'input_0': val_input_0, 'input_1': val_input_1}, val_labels)) if __name__ == "__main__": From 14b14ab32dd3d07f7e0a7d375a6b6d68a6831ccd Mon Sep 17 00:00:00 2001 From: Bhavani Subramanian Date: Tue, 9 Jul 2019 15:54:04 -0700 Subject: [PATCH 0076/3053] Enabled Conv2D fprop for MKL-DNN v1.0. --- tensorflow/core/graph/mkl_layout_pass.cc | 34 +- tensorflow/core/kernels/mkl_conv_ops.cc | 468 ++++++++++++++++++++++- tensorflow/core/kernels/mkl_conv_ops.h | 40 +- 3 files changed, 511 insertions(+), 31 deletions(-) diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index df3cf19e2c0..7ec8e3eea32 100644 --- a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -351,9 +351,10 @@ class MklLayoutRewritePass : public GraphOptimizationPass { csinfo_.mul = "Mul"; csinfo_.squared_difference = "SquaredDifference"; csinfo_.sub = "Sub"; - // End - element-wise ops. See note above. +// End - element-wise ops. See note above. - // NOTE: names are alphabetically sorted. +// NOTE: names are alphabetically sorted. 
+#ifndef ENABLE_MKLDNN_V1 rinfo_.push_back({csinfo_.addn, mkl_op_registry::GetMklOpName(csinfo_.addn), CopyAttrsAddN, AlwaysRewrite, kRewriteForLayoutPropagation}); @@ -388,10 +389,12 @@ class MklLayoutRewritePass : public GraphOptimizationPass { {csinfo_.conjugate_transpose, mkl_op_registry::GetMklOpName(csinfo_.conjugate_transpose), CopyAttrsTranspose, AlwaysRewrite, kRewriteForOpNameChange}); +#endif // ENABLE_MKLDNN_V1 rinfo_.push_back({csinfo_.conv2d, mkl_op_registry::GetMklOpName(csinfo_.conv2d), CopyAttrsConvCheckConstFilter, AlwaysRewrite, kRewriteForLayoutPropagation}); +#ifndef ENABLE_MKLDNN_V1 rinfo_.push_back({csinfo_.conv2d_with_bias, csinfo_.mkl_conv2d_with_bias, CopyAttrsConvCheckConstFilter, AlwaysRewrite, kRewriteForLayoutPropagation}); @@ -632,18 +635,20 @@ class MklLayoutRewritePass : public GraphOptimizationPass { rinfo_.push_back( {csinfo_.requantize, mkl_op_registry::GetMklOpName(csinfo_.requantize), CopyAttrsRequantize, AlwaysRewrite, kRewriteForLayoutPropagation}); - // Disable these two MKL operators for now due to some test failures caused - // by these two ops - /* - rinfo_.push_back({csinfo_.tanh, - mkl_op_registry::GetMklOpName(csinfo_.tanh), - CopyAttrsDataType, AlwaysRewrite, - kRewriteForLayoutPropagation}); - rinfo_.push_back({csinfo_.tanh_grad, - mkl_op_registry::GetMklOpName(csinfo_.tanh_grad), - CopyAttrsDataType, AlwaysRewrite, - kRewriteForLayoutPropagation}); - */ +#endif // ENABLE_MKLDNN_V1 +// Disable these two MKL operators for now due to some test failures caused +// by these two ops +/* +rinfo_.push_back({csinfo_.tanh, + mkl_op_registry::GetMklOpName(csinfo_.tanh), + CopyAttrsDataType, AlwaysRewrite, + kRewriteForLayoutPropagation}); +rinfo_.push_back({csinfo_.tanh_grad, + mkl_op_registry::GetMklOpName(csinfo_.tanh_grad), + CopyAttrsDataType, AlwaysRewrite, + kRewriteForLayoutPropagation}); +*/ +#ifndef ENABLE_MKLDNN_V1 rinfo_.push_back( {csinfo_.reshape, mkl_op_registry::GetMklOpName(csinfo_.reshape), CopyAttrsReshape, AlwaysRewrite, kRewriteForLayoutPropagation}); @@ -744,6 +749,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // CheckForMklOp FuseConv3D, CopyAttrsConv}); +#endif // ENABLE_MKLDNN_V1 } // Standard interface to run pass diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc index 14344da0560..39cc4da3ce0 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_ops.cc @@ -24,8 +24,8 @@ limitations under the License. #include #include -#include "mkldnn.hpp" #include "absl/strings/str_join.h" +#include "mkldnn.hpp" #include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" @@ -50,7 +50,9 @@ limitations under the License. 
using mkldnn::prop_kind; using mkldnn::stream; using mkldnn::convolution_forward; +#ifndef ENABLE_MKLDNN_V1 using mkldnn::convolution_direct; +#endif namespace tensorflow { @@ -93,6 +95,16 @@ typedef mkldnn::convolution_forward::primitive_desc ConvFwdPd; template class MklConvFwdPrimitive : public MklPrimitive { public: +#ifdef ENABLE_MKLDNN_V1 + explicit MklConvFwdPrimitive(const MklConvFwdParams& convFwdDims) + : cpu_engine_(engine::kind::cpu, 0) { + context_.fwd_stream.reset(new stream(cpu_engine_)); + // Create conv primitive + if (context_.conv_fwd == nullptr) { + Setup(convFwdDims); + } + } +#else explicit MklConvFwdPrimitive(const MklConvFwdParams& convFwdDims) : cpu_engine_(engine::cpu, 0) { context_.fwd_stream.reset(new stream(stream::kind::eager)); @@ -101,6 +113,7 @@ class MklConvFwdPrimitive : public MklPrimitive { Setup(convFwdDims); } } +#endif ~MklConvFwdPrimitive() {} @@ -119,7 +132,16 @@ class MklConvFwdPrimitive : public MklPrimitive { static_cast(const_cast(bias_data))); context_.dst_mem->set_data_handle( static_cast(const_cast(dst_data))); +#ifdef ENABLE_MKLDNN_V1 + CHECK_EQ(context_.fwd_primitives.size(), + context_.fwd_primitives_args.size()); + for (size_t i = 0; i < context_.fwd_primitives.size(); ++i) { + context_.fwd_primitives.at(i).execute(*context_.fwd_stream, + context_.fwd_primitives_args.at(i)); + } +#else context_.fwd_stream->submit(context_.fwd_primitives); +#endif // After exec, set data handle back context_.src_mem->set_data_handle(DummyData); @@ -142,7 +164,16 @@ class MklConvFwdPrimitive : public MklPrimitive { static_cast(const_cast(filter_data))); context_.dst_mem->set_data_handle( static_cast(const_cast(dst_data))); +#ifdef ENABLE_MKLDNN_V1 + CHECK_EQ(context_.fwd_primitives.size(), + context_.fwd_primitives_args.size()); + for (size_t i = 0; i < context_.fwd_primitives.size(); ++i) { + context_.fwd_primitives.at(i).execute(*context_.fwd_stream, + context_.fwd_primitives_args.at(i)); + } +#else context_.fwd_stream->submit(context_.fwd_primitives); +#endif // After execution, set data handle back context_.src_mem->set_data_handle(DummyData); @@ -150,9 +181,13 @@ class MklConvFwdPrimitive : public MklPrimitive { context_.dst_mem->set_data_handle(DummyData); } +#ifndef ENABLE_MKLDNN_V1 + // In MKL-DNN v1.0, memory format tags only provide a partial description + // of the memory layout. Hence, these functions are disabled for v1.0. 
memory::format GetSrcMemoryFormat() const { return context_.src_fmt; } memory::format GetFilterMemoryFormat() const { return context_.filter_fmt; } +#endif std::shared_ptr GetPrimitiveDesc() const { return context_.fwd_pd; @@ -161,9 +196,11 @@ class MklConvFwdPrimitive : public MklPrimitive { private: // Primitive reuse context for Conv2D Fwd op struct ConvFwdContext { +#ifndef ENABLE_MKLDNN_V1 // Expected memory format for this primitive instance memory::format src_fmt; memory::format filter_fmt; +#endif // MKLDNN memory std::shared_ptr src_mem; @@ -187,9 +224,16 @@ class MklConvFwdPrimitive : public MklPrimitive { std::shared_ptr fwd_stream; std::vector fwd_primitives; +#ifdef ENABLE_MKLDNN_V1 + std::vector> fwd_primitives_args; +#endif + ConvFwdContext() - : src_fmt(memory::format::any), + : +#ifndef ENABLE_MKLDNN_V1 + src_fmt(memory::format::any), filter_fmt(memory::format::any), +#endif src_mem(nullptr), filter_mem(nullptr), bias_mem(nullptr), @@ -200,34 +244,64 @@ class MklConvFwdPrimitive : public MklPrimitive { bias_md(nullptr), fwd_pd(nullptr), conv_fwd(nullptr), - fwd_stream(nullptr) {} + fwd_stream(nullptr) { + } }; void Setup(const MklConvFwdParams& convFwdDims) { // Create memory descriptors for convolution data w/ no specified format context_.src_md.reset(new memory::desc( +#ifdef ENABLE_MKLDNN_V1 + {convFwdDims.src_dims}, MklDnnType(), memory::format_tag::any)); +#else {convFwdDims.src_dims}, MklDnnType(), memory::format::any)); +#endif context_.filter_md.reset(new memory::desc( +#ifdef ENABLE_MKLDNN_V1 + {convFwdDims.filter_dims}, MklDnnType(), + memory::format_tag::any)); +#else {convFwdDims.filter_dims}, MklDnnType(), memory::format::any)); +#endif context_.dst_md.reset(new memory::desc( +#ifdef ENABLE_MKLDNN_V1 + {convFwdDims.dst_dims}, MklDnnType(), + memory::format_tag::any)); +#else {convFwdDims.dst_dims}, MklDnnType(), memory::format::any)); +#endif if (!convFwdDims.bias_dims.empty()) context_.bias_md.reset(new memory::desc( +#ifdef ENABLE_MKLDNN_V1 + {convFwdDims.bias_dims}, MklDnnType(), + memory::format_tag::any)); +#else {convFwdDims.bias_dims}, MklDnnType(), memory::format::any)); +#endif // Create a convolution if (!convFwdDims.bias_dims.empty()) { context_.fwd_desc.reset(new convolution_forward::desc( +#ifdef ENABLE_MKLDNN_V1 + prop_kind::forward, mkldnn::algorithm::convolution_direct, + *context_.src_md, +#else prop_kind::forward, convolution_direct, *context_.src_md, +#endif *context_.filter_md, *context_.bias_md, *context_.dst_md, convFwdDims.strides, convFwdDims.dilations, convFwdDims.padding_left, convFwdDims.padding_right, padding_kind::zero)); } else { context_.fwd_desc.reset(new convolution_forward::desc( +#ifdef ENABLE_MKLDNN_V1 + prop_kind::forward, mkldnn::algorithm::convolution_direct, + *context_.src_md, +#else prop_kind::forward, convolution_direct, *context_.src_md, +#endif *context_.filter_md, *context_.dst_md, convFwdDims.strides, convFwdDims.dilations, convFwdDims.padding_left, convFwdDims.padding_right, padding_kind::zero)); @@ -246,7 +320,12 @@ class MklConvFwdPrimitive : public MklPrimitive { float op_scale = post_op_param.param[0]; float op_alpha = post_op_param.param[1]; float op_beta = post_op_param.param[2]; +#ifdef ENABLE_MKLDNN_V1 + post_ops.append_eltwise(op_scale, mkldnn::algorithm::eltwise_relu, + op_alpha, +#else post_ops.append_eltwise(op_scale, post_op_param.alg, op_alpha, +#endif op_beta); } else if (post_op_param.name == "sum") { DCHECK_EQ(post_op_param.param.size(), 1); @@ -271,21 +350,54 @@ class MklConvFwdPrimitive : 
public MklPrimitive { context_.fwd_pd.reset(new ConvFwdPd(*context_.fwd_desc, cpu_engine_)); } +#ifndef ENABLE_MKLDNN_V1 // Store the expected memory format context_.src_fmt = static_cast( context_.fwd_pd.get()->src_primitive_desc().desc().data.format); context_.filter_fmt = static_cast( context_.fwd_pd.get()->weights_primitive_desc().desc().data.format); +#endif +#ifdef ENABLE_MKLDNN_V1 // Create memory primitive based on dummy data + context_.src_mem.reset( + new memory(context_.fwd_pd.get()->src_desc(), cpu_engine_, DummyData)); + context_.filter_mem.reset(new memory(context_.fwd_pd.get()->weights_desc(), + cpu_engine_, DummyData)); + context_.dst_mem.reset( + new memory(context_.fwd_pd.get()->dst_desc(), cpu_engine_, DummyData)); +#else context_.src_mem.reset( new memory(context_.fwd_pd.get()->src_primitive_desc(), DummyData)); context_.filter_mem.reset( new memory(context_.fwd_pd.get()->weights_primitive_desc(), DummyData)); context_.dst_mem.reset( new memory(context_.fwd_pd.get()->dst_primitive_desc(), DummyData)); +#endif +#ifdef ENABLE_MKLDNN_V1 + // Create convolution primitive and add it to net + if (!convFwdDims.bias_dims.empty()) { + context_.bias_mem.reset(new memory( + {{convFwdDims.bias_dims}, MklDnnType(), memory::format_tag::x}, + cpu_engine_, DummyData)); + context_.conv_fwd.reset(new convolution_forward(*context_.fwd_pd)); + context_.fwd_primitives_args.push_back( + {{MKLDNN_ARG_SRC, *context_.src_mem}, + {MKLDNN_ARG_WEIGHTS, *context_.filter_mem}, + {MKLDNN_ARG_BIAS, *context_.bias_mem}, + {MKLDNN_ARG_DST, *context_.dst_mem}}); + } else { + context_.conv_fwd.reset(new convolution_forward(*context_.fwd_pd)); + context_.fwd_primitives_args.push_back( + {{MKLDNN_ARG_SRC, *context_.src_mem}, + {MKLDNN_ARG_WEIGHTS, *context_.filter_mem}, + {MKLDNN_ARG_DST, *context_.dst_mem}}); + } + context_.fwd_primitives.push_back(*context_.conv_fwd); + return; +#else // Create convolution primitive and add it to net if (!convFwdDims.bias_dims.empty()) { context_.bias_mem.reset(new memory( @@ -303,6 +415,7 @@ class MklConvFwdPrimitive : public MklPrimitive { context_.fwd_primitives.push_back(*context_.conv_fwd); return; +#endif } struct ConvFwdContext context_; @@ -450,17 +563,15 @@ class MklConvOp : public OpKernel { OP_REQUIRES(context, dilations_.size() == 5, errors::InvalidArgument("Dilation rates field must " "specify 5 dimensions")); - OP_REQUIRES(context, - (GetTensorDim(dilations_, data_format_, 'N') == 1 && - GetTensorDim(dilations_, data_format_, 'C') == 1), + OP_REQUIRES(context, (GetTensorDim(dilations_, data_format_, 'N') == 1 && + GetTensorDim(dilations_, data_format_, 'C') == 1), errors::InvalidArgument( "Current implementation does not yet support " "dilations rates in the batch and depth dimensions.")); OP_REQUIRES( - context, - (GetTensorDim(dilations_, data_format_, '0') > 0 && - GetTensorDim(dilations_, data_format_, '1') > 0 && - GetTensorDim(dilations_, data_format_, '2') > 0), + context, (GetTensorDim(dilations_, data_format_, '0') > 0 && + GetTensorDim(dilations_, data_format_, '1') > 0 && + GetTensorDim(dilations_, data_format_, '2') > 0), errors::InvalidArgument("Dilated rates should be larger than 0.")); } } @@ -566,6 +677,12 @@ class MklConvOp : public OpKernel { auto tf_fmt = is_conv2d ? 
TFDataFormatToMklDnnDataFormat(data_format_) : TFDataFormatToMklDnn3DDataFormat(data_format_); +#ifdef ENABLE_MKLDNN_V1 + auto mkl_fmt_tag = MklTensorFormatToMklDnnDataFormat(tf_fmt); + // NOTE: `mkl_fmt_tag` will be `format_tag::undef` for ReLU + CHECK_NE(mkl_fmt_tag, memory::format_tag::undef); +#endif + // If input is in MKL layout, then simply grab the layout; otherwise, // construct TF layout for input. // For constructing TF layout for input, although input shape (src_dims) @@ -573,18 +690,28 @@ class MklConvOp : public OpKernel { // TF layout depending on the data format: // Conv2D: NHWC or NCHW // Conv3D: NDHWC or NCDHW - auto src_md = src_mkl_shape.IsMklTensor() - ? src_mkl_shape.GetMklLayout() - : memory::desc(src_dims, MklDnnType(), tf_fmt); + auto src_md = + src_mkl_shape.IsMklTensor() + ? src_mkl_shape.GetMklLayout() +#ifdef ENABLE_MKLDNN_V1 + : memory::desc(src_dims, MklDnnType(), mkl_fmt_tag); +#else + : memory::desc(src_dims, MklDnnType(), tf_fmt); +#endif src.SetUsrMem(src_md, &src_tensor); +#ifdef ENABLE_MKLDNN_V1 // Although filter shape (filter_dims) required is in MKL-DNN order, // the layout is Tensorflow's layout (HWIO) and (HWIGO) for // depthwise/group convolutions. - + auto filter_format = is_conv2d ? (is_depthwise ? memory::format_tag::hwigo + : memory::format_tag::hwio) + : memory::format_tag::dhwio; +#else auto filter_format = is_conv2d ? (is_depthwise ? memory::format::hwigo : memory::format::hwio) : memory::format::dhwio; +#endif DCHECK(!filter_mkl_shape.IsMklTensor()); auto filter_md = @@ -643,6 +770,51 @@ class MklConvOp : public OpKernel { // Check whether src and filter need to be reordered Tinput* src_data = nullptr; +#ifdef ENABLE_MKLDNN_V1 + if (src_md != conv_fwd_pd->src_desc()) { + // Reorder src + src.SetUsrMem(src_md, &src_tensor); + src.CheckReorderToOpMem(conv_fwd_pd->src_desc(), cpu_engine_); + src_data = static_cast(src.GetOpMem().get_data_handle()); + } else { + src_data = static_cast( + const_cast(src_tensor.flat().data())); + } + + Tfilter* filter_data = nullptr; + if (filter_md != conv_fwd_pd->weights_desc()) { + bool is_filter_cached = false; + // If filter is a constant, we can avoid the conversion of filter from + // Tensorflow format to MKL format by caching the filter when it is + // converted for the first time. This cached filter can then be reused + // in subsequent iterations. + if (is_filter_const_) { + if (IsFilterCacheEmpty(context)) { + // Cache filter if it is not already cached. 
+ CacheFilter(context, conv_fwd_pd, filter_data, filter_tensor, + filter, filter_md, filter_mkl_shape); + } + filter_data = GetCachedFilter(context, conv_fwd_pd->weights_desc()); + is_filter_cached = (filter_data != nullptr); + } + if (!is_filter_cached) { + filter.SetUsrMem(filter_md, &filter_tensor); + if (filter_out_tensor == nullptr) { + filter.CheckReorderToOpMem(conv_fwd_pd->weights_desc(), + cpu_engine_); + } else { + filter.CheckReorderToOpMem( + conv_fwd_pd->weights_desc(), + filter.GetTensorBuffer(filter_out_tensor), cpu_engine_); + } + filter_data = + static_cast(filter.GetOpMem().get_data_handle()); + } + } else { + filter_data = static_cast( + const_cast(filter_tensor.flat().data())); + } +#else if (src_md.data.format != conv_fwd->GetSrcMemoryFormat()) { // Reorder src src.SetUsrMem(src_md, &src_tensor); @@ -687,6 +859,7 @@ class MklConvOp : public OpKernel { filter_data = static_cast( const_cast(filter_tensor.flat().data())); } +#endif // Execute convolution if (fuse_biasadd_) { @@ -805,6 +978,35 @@ class MklConvOp : public OpKernel { return nullptr; } +#ifdef ENABLE_MKLDNN_V1 + virtual void AllocateOutputTensor(OpKernelContext* context, + const ConvFwdPd& conv_prim_desc, + const memory::dims& output_dims_mkl_order, + MklTensorFormat output_tf_format, + Tensor** output_tensor) { + CHECK_NOTNULL(output_tensor); + auto dst_md = conv_prim_desc.dst_desc(); + + if (!std::is_same::value) { + dst_md.data.data_type = + static_cast(MklDnnType()); + } + // Allocate shape of Mkl tensor. + MklDnnShape output_mkl_shape; + output_mkl_shape.SetMklTensor(true); + output_mkl_shape.SetMklLayout(&dst_md); + output_mkl_shape.SetElemType(MklDnnType()); + output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), + output_dims_mkl_order, output_tf_format); + + // Allocate shape of TF tensor. 
+ TensorShape output_tf_shape; + output_tf_shape.AddDim((dst_md.get_size() / sizeof(Toutput))); + + AllocateOutputSetMklShape(context, kOutputIndex_Dst, output_tensor, + output_tf_shape, output_mkl_shape); + } +#else virtual void AllocateOutputTensor(OpKernelContext* context, const ConvFwdPd& conv_prim_desc, const memory::dims& output_dims_mkl_order, @@ -862,8 +1064,13 @@ class MklConvOp : public OpKernel { } } } +#endif +#ifdef ENABLE_MKLDNN_V1 + engine cpu_engine_ = engine(engine::kind::cpu, 0); +#else engine cpu_engine_ = engine(engine::cpu, 0); +#endif private: std::vector strides_; @@ -892,8 +1099,105 @@ class MklConvOp : public OpKernel { const int kOutputIndex_Dst = 0, kOutputIndex_Filter = 1; const int kDilationH = 0, kDilationW = 1; +#ifdef ENABLE_MKLDNN_V1 // Allocate persistent tensors for cached filter data and // cached filter memory descriptor (data format) + void AllocatePersistentTensor(OpKernelContext* context, + const ConvFwdPd& conv_prim_desc, + Tensor** filter_tensor, + const MklDnnShape& filter_mkl_shape) { + DCHECK(filter_tensor); + TensorShape filter_tf_shape; + filter_tf_shape.AddDim( + (conv_prim_desc.weights_desc().get_size() / sizeof(Tfilter))); + OP_REQUIRES_OK(context, context->allocate_persistent( + DataTypeToEnum::value, filter_tf_shape, + &cached_filter_data_ptensor_, filter_tensor)); + + Tensor* second_tensor = nullptr; + TensorShape filter_mkl_format; + filter_mkl_format.AddDim(sizeof(filter_mkl_shape.GetTfDataFormat()) / + sizeof(DT_INT32)); + OP_REQUIRES_OK(context, context->allocate_persistent( + DT_INT32, filter_mkl_format, + &cached_filter_md_ptensor_, &second_tensor)); + second_tensor->scalar()() = + static_cast(filter_mkl_shape.GetTfDataFormat()); + } + + void AllocateFilterOutputTensor(OpKernelContext* context, + const ConvFwdPd& conv_prim_desc, + const memory::dims& filter_dims_tf_order, + Tensor** filter_tensor) { + CHECK_NOTNULL(filter_tensor); + auto filter_md = conv_prim_desc.weights_desc(); + + // Allocate shape of Mkl tensor. + MklDnnShape filter_mkl_shape; + filter_mkl_shape.SetMklTensor(true); + filter_mkl_shape.SetMklLayout(&filter_md); + filter_mkl_shape.SetElemType(MklDnnType()); + + // The format of the filter is actually OIhw8i8o, but TF doesn't support + // this format. Just use format::blocked for now because the layout + // is stored in the MKL data. + filter_mkl_shape.SetTfLayout(filter_dims_tf_order.size(), + filter_dims_tf_order, + MklTensorFormat::FORMAT_UNDEF); + + // Allocate the data space for the filter to propagate as TF tensor. + TensorShape filter_tf_shape; + filter_tf_shape.AddDim((filter_md.get_size() / sizeof(Tfilter))); + + AllocateOutputSetMklShape(context, kOutputIndex_Filter, filter_tensor, + filter_tf_shape, filter_mkl_shape); + } + + // Prepare and execute net - checks for input and output reorders. + void PrepareAndExecuteNet(const ConvFwdPd& conv_prim_desc, + MklDnnData* src, + MklDnnData* filter, + MklDnnData* bias, + MklDnnData* output, + Tensor* filter_out_tensor) { + CHECK_NOTNULL(filter_out_tensor); + + // Create reorders between user layout and MKL layout if it is needed and + // add it to the net before convolution. No need to check for output + // reorder as we propagate output layout to the next layer. 
+ src->CheckReorderToOpMem(conv_prim_desc.src_desc(), cpu_engine_); + + // rather than re-order to a temp buffer, reorder directly to the + // filter output tensor + filter->CheckReorderToOpMem(conv_prim_desc.weights_desc(), + filter->GetTensorBuffer(filter_out_tensor)); + + // Create convolution primitive and add it to net. + std::vector net; + std::vector> net_args; + if (bias) { + DCHECK(fuse_biasadd_); + net.push_back(convolution_forward(conv_prim_desc)); + net_args.push_back({{MKLDNN_ARG_SRC, src->GetOpMem()}, + {MKLDNN_ARG_WEIGHTS, filter->GetOpMem()}, + {MKLDNN_ARG_BIAS, bias->GetOpMem()}, + {MKLDNN_ARG_DST, output->GetOpMem()}}); + } else { + DCHECK(!fuse_biasadd_); + net.push_back(convolution_forward(conv_prim_desc)); + net_args.push_back({{MKLDNN_ARG_SRC, src->GetOpMem()}, + {MKLDNN_ARG_WEIGHTS, filter->GetOpMem()}, + {MKLDNN_ARG_DST, output->GetOpMem()}}); + } + stream cpu_stream(cpu_engine_); + + CHECK_EQ(net.size(), net_args.size()); + for (size_t i = 0; i < net.size(); ++i) { + net.at(i).execute(cpu_stream, net_args.at(i)); + } + cpu_stream.wait(); + } +#else void AllocatePersistentTensor(OpKernelContext* context, const ConvFwdPd& conv_prim_desc, Tensor** filter_tensor) { @@ -979,6 +1283,7 @@ class MklConvOp : public OpKernel { stream(stream::kind::eager).submit(net).wait(); } +#endif // LOCKS_EXCLUDED annotation ensures that the lock (mu_) cannot // be acquired before entering the function, since it is acquired @@ -990,6 +1295,37 @@ class MklConvOp : public OpKernel { return (cached_filter_data_tensor.NumElements() == 0); } +#ifdef ENABLE_MKLDNN_V1 + // Cache the converted filter in a persistent tensor. + // Only one thread can execute this method at any given time. + void CacheFilter(OpKernelContext* context, + const std::shared_ptr& conv_fwd_pd, + Tfilter* filter_data, const Tensor& filter_tensor, + MklDnnData& filter, const memory::desc& filter_md, + const MklDnnShape& filter_mkl_shape) LOCKS_EXCLUDED(mu_) { + mutex_lock lock(mu_); + const Tensor& cached_filter_data_tensor = + *cached_filter_data_ptensor_.AccessTensor(context); + + // If filter is already cached, there's nothing to do. + if (cached_filter_data_tensor.NumElements() > 0) { + return; + } + + // Otherwise, cache filter + filter.SetUsrMem(filter_md, &filter_tensor); + filter.CheckReorderToOpMem(conv_fwd_pd.get()->weights_desc(), + this->cpu_engine_); + filter_data = static_cast(filter.GetOpMem().get_data_handle()); + + Tensor* filter_tensor_ptr = nullptr; + AllocatePersistentTensor(context, *conv_fwd_pd, &filter_tensor_ptr, + filter_mkl_shape); + void* cached_filter_data = filter.GetTensorBuffer(filter_tensor_ptr); + size_t cached_filter_data_size = filter.GetOpMem().get_desc().get_size(); + memcpy(cached_filter_data, filter_data, cached_filter_data_size); + } +#else // Cache the converted filter in a persistent tensor. // Only one thread can execute this method at any given time. 
void CacheFilter(OpKernelContext* context, @@ -1018,7 +1354,45 @@ class MklConvOp : public OpKernel { filter.GetOpMem().get_primitive_desc().get_size(); memcpy(cached_filter_data, filter_data, cached_filter_data_size); } +#endif +#ifdef ENABLE_MKLDNN_V1 + bool AreMemoryDescriptorsEqual(const memory::desc& filter_md, + const Tensor& cached_filter_md) { + auto filter_md_data = filter_md.data; + const char* filter_data = reinterpret_cast(&filter_md_data); + + auto cached_filter_md_data = cached_filter_md.scalar()(); + const char* cached_filter_data = + reinterpret_cast(&cached_filter_md_data); + + for (size_t i = 0; i < sizeof(filter_md_data); ++i) { + if (*filter_data++ != *cached_filter_data++) { + return false; + } + } + return true; + } + + Tfilter* GetCachedFilter(OpKernelContext* context, + const memory::desc& filter_md) LOCKS_EXCLUDED(mu_) { + tf_shared_lock lock(mu_); + const Tensor& cached_filter_data = + *cached_filter_data_ptensor_.AccessTensor(context); + const Tensor& cached_filter_md = + *cached_filter_md_ptensor_.AccessTensor(context); + + // Check if the memory descriptor of the cached weights is same as + // filter_mf. If so, we can used the cached weights; otherwise + // return NULL. + if (cached_filter_md.scalar().size() && + AreMemoryDescriptorsEqual(filter_md, cached_filter_md)) { + return static_cast( + const_cast(cached_filter_data.flat().data())); + } + return nullptr; + } +#else Tfilter* GetCachedFilter(OpKernelContext* context, const memory::format& filter_mf) LOCKS_EXCLUDED(mu_) { @@ -1039,6 +1413,7 @@ class MklConvOp : public OpKernel { } return nullptr; } +#endif }; // Base class for fused convolution forward operations @@ -1294,6 +1669,9 @@ class MklQuantizedConv2DOp const float* max_filter = max_filter_vector.flat().data(); std::vector net; +#ifdef ENABLE_MKLDNN_V1 + std::vector> net_args; +#endif if (bias_enabled) { if (std::is_same::value) { return static_cast( @@ -1315,6 +1693,32 @@ class MklQuantizedConv2DOp } else { bias_attr.set_output_scales(1, scales); } +#ifdef ENABLE_MKLDNN_V1 + auto bias_md = + memory::desc({static_cast(bias_tensor.NumElements())}, + MklDnnType(), memory::format_tag::x); + + void* bias_buf = static_cast( + const_cast(bias_tensor.flat().data())); + input_bias_ = new memory(bias_md, this->cpu_engine_, bias_buf); + scaled_bias_ = new memory(conv_fwd_pd->bias_desc(), this->cpu_engine_); + auto reorder_desc = mkldnn::reorder::primitive_desc( + this->cpu_engine_, input_bias_->get_desc(), this->cpu_engine_, + scaled_bias_->get_desc(), bias_attr); + net.push_back(mkldnn::reorder(reorder_desc)); + net_args.push_back({{MKLDNN_ARG_FROM, *input_bias_}, + {MKLDNN_ARG_TO, *scaled_bias_}}); + + CHECK_EQ(net.size(), net_args.size()); + + stream cpu_stream(this->cpu_engine_); + for (size_t i = 0; i < net.size(); ++i) { + net.at(i).execute(cpu_stream, net_args.at(i)); + } + cpu_stream.wait(); + + return reinterpret_cast(scaled_bias_->get_data_handle()); +#else auto bias_pd = memory::primitive_desc({{static_cast(bias_tensor.NumElements())}, MklDnnType(), @@ -1331,6 +1735,7 @@ class MklQuantizedConv2DOp net.push_back(mkldnn::reorder(reorder_desc, *input_bias_, *scaled_bias_)); stream(stream::kind::eager).submit(net).wait(); return reinterpret_cast(scaled_bias_->get_data_handle()); +#endif } else { return nullptr; } @@ -1431,7 +1836,11 @@ class MklQuantizedConv2DSumReluOp void AllocateOutputTensor(OpKernelContext* context, const ConvFwdPd& conv_prim_desc, const memory::dims& output_dims_mkl_order, +#ifdef ENABLE_MKLDNN_V1 + MklTensorFormat 
output_tf_format, +#else memory::format output_tf_format, +#endif Tensor** output_tensor) override { int summand_idx = context->num_inputs() / 2 - 1; if (std::is_same::value) { @@ -1499,6 +1908,36 @@ class MklQuantizedConv2DSumReluOp } else { reorder_attr.set_output_scales(2, scales); } +#ifdef ENABLE_MKLDNN_V1 + auto summand_md = + summand_mkl_shape.IsMklTensor() + ? summand_mkl_shape.GetMklLayout() + : memory::desc(output_dims_mkl_order, MklDnnType(), + memory::format_tag::nhwc); + void* summand_buf = + static_cast(const_cast(summand.flat().data())); + void* dst_buf = + static_cast((*output_tensor)->flat().data()); + summand_ = new memory(summand_md, this->cpu_engine_, summand_buf); + dst_ = new memory(conv_prim_desc.dst_desc(), this->cpu_engine_, dst_buf); + auto reorder_desc = mkldnn::reorder::primitive_desc( + this->cpu_engine_, summand_md, this->cpu_engine_, + conv_prim_desc.dst_desc(), reorder_attr); + + std::vector net; + std::vector> net_args; + + net.push_back(mkldnn::reorder(reorder_desc)); + net_args.push_back({{MKLDNN_ARG_FROM, *summand_}, + {MKLDNN_ARG_TO, *dst_}}); + CHECK_EQ(net.size(), net_args.size()); + + stream cpu_stream(this->cpu_engine_); + for (size_t i = 0; i < net.size(); ++i) { + net.at(i).execute(cpu_stream, net_args.at(i)); + } + cpu_stream.wait(); +#else auto summand_md = summand_mkl_shape.IsMklTensor() ? summand_mkl_shape.GetMklLayout() @@ -1517,6 +1956,7 @@ class MklQuantizedConv2DSumReluOp std::vector net; net.push_back(mkldnn::reorder(reorder_desc, *summand_, *dst_)); stream(stream::kind::eager).submit(net).wait(); +#endif } memory* summand_ = nullptr; diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h index c12a4ff0f0c..2399f5213a3 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.h +++ b/tensorflow/core/kernels/mkl_conv_ops.h @@ -40,7 +40,9 @@ limitations under the License. #include "tensorflow/core/util/padding.h" #include "tensorflow/core/util/tensor_format.h" +#ifndef ENABLE_MKLDNN_V1 using mkldnn::convolution_direct; +#endif using mkldnn::convolution_forward; using mkldnn::prop_kind; using mkldnn::stream; @@ -136,8 +138,13 @@ class MklDnnConvUtil { CHECK_BOUNDS(input_cols_raw, "Input cols too large"); int input_cols = static_cast(input_cols_raw); +#ifdef ENABLE_MKLDNN_V1 + // MKL-DNN always requires input in NCHW format Conv2D. + std::vector mkldnn_sizes(4, -1); +#else // MKL-DNN always requires input in NCHW format Conv2D. std::vector mkldnn_sizes(4, -1); +#endif mkldnn_sizes[MklDnnDims::Dim_N] = input_batch; mkldnn_sizes[MklDnnDims::Dim_C] = input_depth; mkldnn_sizes[MklDnnDims::Dim_H] = input_rows; @@ -160,8 +167,13 @@ class MklDnnConvUtil { CHECK_BOUNDS(input_cols_raw, "Input cols too large"); int input_cols = static_cast(input_cols_raw); +#ifdef ENABLE_MKLDNN_V1 + // MKL-DNN always requires input in NCDHW format for Conv3D. + std::vector mkldnn_sizes(5, -1); +#else // MKL-DNN always requires input in NCDHW format for Conv3D. std::vector mkldnn_sizes(5, -1); +#endif mkldnn_sizes[MklDnnDims3D::Dim3d_N] = input_batch; mkldnn_sizes[MklDnnDims3D::Dim3d_C] = input_depth; mkldnn_sizes[MklDnnDims3D::Dim3d_D] = input_planes; @@ -196,9 +208,8 @@ class MklDnnConvUtil { filter_shape.DebugString())); for (int i = 0; i < ((strides_.size() == 4) ? 
3 : 5); i++) { - OP_REQUIRES(context_, - FastBoundsCheck(filter_shape.dim_size(i), - std::numeric_limits::max()), + OP_REQUIRES(context_, FastBoundsCheck(filter_shape.dim_size(i), + std::numeric_limits::max()), errors::InvalidArgument("filter too large")); } @@ -225,7 +236,11 @@ class MklDnnConvUtil { // GOIHW = (group, out_depth, in_depth, rows, cols) // Specifically for depthwise G=filter_indepth, O=filter_outdepth, I=1 if (is_depthwise) { +#ifdef ENABLE_MKLDNN_V1 + std::vector mkldnn_sizes(5, -1); +#else std::vector mkldnn_sizes(5, -1); +#endif mkldnn_sizes[MKL_GROUP_FILTER_DIM_G] = filter_in_depth; mkldnn_sizes[MKL_GROUP_FILTER_DIM_O] = filter_out_depth; mkldnn_sizes[MKL_GROUP_FILTER_DIM_I] = 1; @@ -234,7 +249,11 @@ class MklDnnConvUtil { *filter_dims = mkldnn_sizes; } else { +#ifdef ENABLE_MKLDNN_V1 + std::vector mkldnn_sizes(4, -1); +#else std::vector mkldnn_sizes(4, -1); +#endif mkldnn_sizes[MklDnnDims::Dim_O] = filter_out_depth; mkldnn_sizes[MklDnnDims::Dim_I] = filter_in_depth; mkldnn_sizes[MklDnnDims::Dim_H] = filter_rows; @@ -260,9 +279,15 @@ class MklDnnConvUtil { int filter_out_depth = static_cast(filter_shape.dim_size(TF_3DFILTER_DIM_O)); +#ifdef ENABLE_MKLDNN_V1 + // MKL-DNN always needs filter in OIDHW format. + // OIDHW = (out_depth, in_depth, planes, rows, cols) + std::vector mkldnn_sizes(5, -1); +#else // MKL-DNN always needs filter in OIDHW format. // OIDHW = (out_depth, in_depth, planes, rows, cols) std::vector mkldnn_sizes(5, -1); +#endif mkldnn_sizes[MklDnnDims3D::Dim3d_O] = filter_out_depth; mkldnn_sizes[MklDnnDims3D::Dim3d_I] = filter_in_depth; mkldnn_sizes[MklDnnDims3D::Dim3d_D] = filter_planes; @@ -451,15 +476,24 @@ class MklDnnConvUtil { *output_dims_tf_order = TFShapeToMklDnnDims(out_shape); if (is_conv2d) { +#ifdef ENABLE_MKLDNN_V1 + // For Conv2D, MKL-DNN always needs output in NCHW format. + std::vector mkldnn_sizes(4, -1); +#else // For Conv2D, MKL-DNN always needs output in NCHW format. 
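[Editor's aside] The Get*SizeInMklOrder helpers in this header all perform the same repacking: TensorFlow supplies a shape in its native NHWC (or NDHWC) order, and the helper emits a dims vector in the NCHW-style order MKL-DNN expects. A stand-alone illustration with hypothetical shape values, using a plain std::vector in place of memory::dims (this is not the helper code itself):

#include <iostream>
#include <vector>

int main() {
  // Hypothetical NHWC input shape: batch=8, rows=224, cols=224, depth=3.
  std::vector<int> nhwc = {8, 224, 224, 3};

  // MKL-DNN slot order for a 4-D tensor is N, C, H, W.
  enum { kN = 0, kC = 1, kH = 2, kW = 3 };
  std::vector<int> mkldnn_sizes(4, -1);
  mkldnn_sizes[kN] = nhwc[0];
  mkldnn_sizes[kC] = nhwc[3];
  mkldnn_sizes[kH] = nhwc[1];
  mkldnn_sizes[kW] = nhwc[2];

  for (int d : mkldnn_sizes) std::cout << d << " ";  // prints: 8 3 224 224
  std::cout << "\n";
  return 0;
}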
std::vector mkldnn_sizes(4, -1); +#endif mkldnn_sizes[MklDnnDims::Dim_N] = out_batch; mkldnn_sizes[MklDnnDims::Dim_C] = out_depth; mkldnn_sizes[MklDnnDims::Dim_H] = static_cast(out_rows); mkldnn_sizes[MklDnnDims::Dim_W] = static_cast(out_cols); *output_dims_mkl_order = mkldnn_sizes; } else { +#ifdef ENABLE_MKLDNN_V1 + std::vector mkldnn_sizes(5, -1); +#else std::vector mkldnn_sizes(5, -1); +#endif mkldnn_sizes[MklDnnDims3D::Dim3d_N] = out_batch; mkldnn_sizes[MklDnnDims3D::Dim3d_C] = out_depth; mkldnn_sizes[MklDnnDims3D::Dim3d_D] = static_cast(out_planes); From 89901f842d029ed8f20bc6d3a01ffb93633baef3 Mon Sep 17 00:00:00 2001 From: Dayananda-V Date: Wed, 3 Jul 2019 14:49:02 +0530 Subject: [PATCH 0077/3053] [Lite]Bugfix System.loadLibrary exception handle when application fail to load --- .../com/example/android/smartreply/SmartReplyClient.java | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReplyClient.java b/tensorflow/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReplyClient.java index fbd75051e71..cbd155bb0cd 100644 --- a/tensorflow/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReplyClient.java +++ b/tensorflow/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReplyClient.java @@ -53,8 +53,13 @@ public class SmartReplyClient implements AutoCloseable { @WorkerThread public synchronized void loadModel() { if (!isLibraryLoaded) { - System.loadLibrary(JNI_LIB); - isLibraryLoaded = true; + try { + System.loadLibrary(JNI_LIB); + isLibraryLoaded = true; + } catch (Exception e) { + Log.e(TAG, "Failed to load prebuilt smartreply_jni lib", e); + return; + } } try { From 2f61f75e244891e9ce1d10fa3a34fd4cb419a5d4 Mon Sep 17 00:00:00 2001 From: Trevor Morris Date: Wed, 10 Jul 2019 10:13:45 -0700 Subject: [PATCH 0078/3053] Cast shape to integer. Fix formatting. --- tensorflow/python/keras/layers/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py index eb45636e677..b21801786d9 100644 --- a/tensorflow/python/keras/layers/core.py +++ b/tensorflow/python/keras/layers/core.py @@ -582,11 +582,11 @@ class Flatten(Layer): input_shape = tensor_shape.TensorShape(inputs.shape).as_list() if input_shape and all(input_shape[1:]): - outputs = array_ops.reshape(inputs, (-1, np.prod(input_shape[1:]))) + outputs = array_ops.reshape(inputs, (-1, int(np.prod(input_shape[1:])))) else: outputs = array_ops.reshape( inputs, (tensor_shape.dimension_value(inputs.shape[0]) or - array_ops.shape(inputs)[0], -1)) + array_ops.shape(inputs)[0], -1)) if not context.executing_eagerly(): outputs.set_shape(self.compute_output_shape(inputs.shape)) return outputs From 3608a971bb3413e55494497e6b30a3e1b46aec5b Mon Sep 17 00:00:00 2001 From: Bhavani Subramanian Date: Wed, 10 Jul 2019 10:25:27 -0700 Subject: [PATCH 0079/3053] Changed CHECK to DCHECK. 
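[Editor's note] Context for the change: CHECK_* macros are evaluated in every build and abort the process on failure, while DCHECK_* macros compile away in optimized builds, so they are the better fit for internal invariants on hot per-call paths like the size checks touched below. A minimal sketch of the difference using stand-in macros (MY_CHECK_EQ / MY_DCHECK_EQ are hypothetical names, not the actual macros from tensorflow/core/platform/logging.h):

#include <cstdlib>
#include <iostream>

// Always evaluated and enforced, even in release builds.
#define MY_CHECK_EQ(a, b)                                       \
  do {                                                          \
    if ((a) != (b)) {                                           \
      std::cerr << "Check failed: " #a " == " #b << std::endl;  \
      std::abort();                                             \
    }                                                           \
  } while (0)

// Compiled out entirely when NDEBUG is defined (optimized builds).
#ifdef NDEBUG
#define MY_DCHECK_EQ(a, b) \
  do {                     \
  } while (0)
#else
#define MY_DCHECK_EQ(a, b) MY_CHECK_EQ(a, b)
#endif

int main() {
  MY_DCHECK_EQ(2 + 2, 4);  // debug-only sanity check, free in opt builds
  MY_CHECK_EQ(2 + 2, 4);   // always enforced
  return 0;
}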
--- tensorflow/core/kernels/mkl_conv_ops.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc index 39cc4da3ce0..b9ef04413c9 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_ops.cc @@ -133,7 +133,7 @@ class MklConvFwdPrimitive : public MklPrimitive { context_.dst_mem->set_data_handle( static_cast(const_cast(dst_data))); #ifdef ENABLE_MKLDNN_V1 - CHECK_EQ(context_.fwd_primitives.size(), + DCHECK_EQ(context_.fwd_primitives.size(), context_.fwd_primitives_args.size()); for (size_t i = 0; i < context_.fwd_primitives.size(); ++i) { context_.fwd_primitives.at(i).execute(*context_.fwd_stream, @@ -165,7 +165,7 @@ class MklConvFwdPrimitive : public MklPrimitive { context_.dst_mem->set_data_handle( static_cast(const_cast(dst_data))); #ifdef ENABLE_MKLDNN_V1 - CHECK_EQ(context_.fwd_primitives.size(), + DCHECK_EQ(context_.fwd_primitives.size(), context_.fwd_primitives_args.size()); for (size_t i = 0; i < context_.fwd_primitives.size(); ++i) { context_.fwd_primitives.at(i).execute(*context_.fwd_stream, @@ -680,7 +680,7 @@ class MklConvOp : public OpKernel { #ifdef ENABLE_MKLDNN_V1 auto mkl_fmt_tag = MklTensorFormatToMklDnnDataFormat(tf_fmt); // NOTE: `mkl_fmt_tag` will be `format_tag::undef` for ReLU - CHECK_NE(mkl_fmt_tag, memory::format_tag::undef); + DCHECK_NE(mkl_fmt_tag, memory::format_tag::undef); #endif // If input is in MKL layout, then simply grab the layout; otherwise, @@ -1191,7 +1191,7 @@ class MklConvOp : public OpKernel { } stream cpu_stream(cpu_engine_); - CHECK_EQ(net.size(), net_args.size()); + DCHECK_EQ(net.size(), net_args.size()); for (size_t i = 0; i < net.size(); ++i) { net.at(i).execute(cpu_stream, net_args.at(i)); } @@ -1709,7 +1709,7 @@ class MklQuantizedConv2DOp net_args.push_back({{MKLDNN_ARG_FROM, *input_bias_}, {MKLDNN_ARG_TO, *scaled_bias_}}); - CHECK_EQ(net.size(), net_args.size()); + DCHECK_EQ(net.size(), net_args.size()); stream cpu_stream(this->cpu_engine_); for (size_t i = 0; i < net.size(); ++i) { @@ -1930,7 +1930,7 @@ class MklQuantizedConv2DSumReluOp net.push_back(mkldnn::reorder(reorder_desc)); net_args.push_back({{MKLDNN_ARG_FROM, *summand_}, {MKLDNN_ARG_TO, *dst_}}); - CHECK_EQ(net.size(), net_args.size()); + DCHECK_EQ(net.size(), net_args.size()); stream cpu_stream(this->cpu_engine_); for (size_t i = 0; i < net.size(); ++i) { From fdf9ee647ef267f847d11173d3c391e57762a9c9 Mon Sep 17 00:00:00 2001 From: Bhavani Subramanian Date: Wed, 10 Jul 2019 11:05:12 -0700 Subject: [PATCH 0080/3053] Ran Clang format checks. 
--- tensorflow/core/kernels/mkl_conv_ops.cc | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc index b9ef04413c9..d7a457e3729 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_ops.cc @@ -134,7 +134,7 @@ class MklConvFwdPrimitive : public MklPrimitive { static_cast(const_cast(dst_data))); #ifdef ENABLE_MKLDNN_V1 DCHECK_EQ(context_.fwd_primitives.size(), - context_.fwd_primitives_args.size()); + context_.fwd_primitives_args.size()); for (size_t i = 0; i < context_.fwd_primitives.size(); ++i) { context_.fwd_primitives.at(i).execute(*context_.fwd_stream, context_.fwd_primitives_args.at(i)); @@ -166,7 +166,7 @@ class MklConvFwdPrimitive : public MklPrimitive { static_cast(const_cast(dst_data))); #ifdef ENABLE_MKLDNN_V1 DCHECK_EQ(context_.fwd_primitives.size(), - context_.fwd_primitives_args.size()); + context_.fwd_primitives_args.size()); for (size_t i = 0; i < context_.fwd_primitives.size(); ++i) { context_.fwd_primitives.at(i).execute(*context_.fwd_stream, context_.fwd_primitives_args.at(i)); @@ -387,13 +387,15 @@ class MklConvFwdPrimitive : public MklPrimitive { {{MKLDNN_ARG_SRC, *context_.src_mem}, {MKLDNN_ARG_WEIGHTS, *context_.filter_mem}, {MKLDNN_ARG_BIAS, *context_.bias_mem}, - {MKLDNN_ARG_DST, *context_.dst_mem}}); + { MKLDNN_ARG_DST, + *context_.dst_mem }}); } else { context_.conv_fwd.reset(new convolution_forward(*context_.fwd_pd)); context_.fwd_primitives_args.push_back( {{MKLDNN_ARG_SRC, *context_.src_mem}, {MKLDNN_ARG_WEIGHTS, *context_.filter_mem}, - {MKLDNN_ARG_DST, *context_.dst_mem}}); + { MKLDNN_ARG_DST, + *context_.dst_mem }}); } context_.fwd_primitives.push_back(*context_.conv_fwd); return; @@ -804,7 +806,7 @@ class MklConvOp : public OpKernel { cpu_engine_); } else { filter.CheckReorderToOpMem( - conv_fwd_pd->weights_desc(), + conv_fwd_pd->weights_desc(), filter.GetTensorBuffer(filter_out_tensor), cpu_engine_); } filter_data = @@ -1181,13 +1183,15 @@ class MklConvOp : public OpKernel { net_args.push_back({{MKLDNN_ARG_SRC, src->GetOpMem()}, {MKLDNN_ARG_WEIGHTS, filter->GetOpMem()}, {MKLDNN_ARG_BIAS, bias->GetOpMem()}, - {MKLDNN_ARG_DST, output->GetOpMem()}}); + { MKLDNN_ARG_DST, + output->GetOpMem() }}); } else { DCHECK(!fuse_biasadd_); net.push_back(convolution_forward(conv_prim_desc)); net_args.push_back({{MKLDNN_ARG_SRC, src->GetOpMem()}, {MKLDNN_ARG_WEIGHTS, filter->GetOpMem()}, - {MKLDNN_ARG_DST, output->GetOpMem()}}); + { MKLDNN_ARG_DST, + output->GetOpMem() }}); } stream cpu_stream(cpu_engine_); @@ -1707,7 +1711,8 @@ class MklQuantizedConv2DOp scaled_bias_->get_desc(), bias_attr); net.push_back(mkldnn::reorder(reorder_desc)); net_args.push_back({{MKLDNN_ARG_FROM, *input_bias_}, - {MKLDNN_ARG_TO, *scaled_bias_}}); + { MKLDNN_ARG_TO, + *scaled_bias_ }}); DCHECK_EQ(net.size(), net_args.size()); @@ -1929,7 +1934,8 @@ class MklQuantizedConv2DSumReluOp net.push_back(mkldnn::reorder(reorder_desc)); net_args.push_back({{MKLDNN_ARG_FROM, *summand_}, - {MKLDNN_ARG_TO, *dst_}}); + { MKLDNN_ARG_TO, + *dst_ }}); DCHECK_EQ(net.size(), net_args.size()); stream cpu_stream(this->cpu_engine_); From da49f65bf7d6d01225f871e14cf4d57dd9304df5 Mon Sep 17 00:00:00 2001 From: TengLu Date: Thu, 11 Jul 2019 10:19:34 +0800 Subject: [PATCH 0081/3053] Update mkl_layout_pass.cc Change the code style according to review suggestion. 
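[Editor's note] The one-line change below swaps iter++ for ++iter in the attribute-copy loop. For iterator types, post-increment must construct and return a copy of the old iterator, which is wasted work when the returned value is discarded; pre-increment simply advances the iterator in place. A small stand-alone illustration with hypothetical map contents (not the layout-pass code):

#include <iostream>
#include <map>
#include <string>

int main() {
  std::map<std::string, int> attrs = {{"T", 1}, {"N", 2}};

  // it++ would create and immediately discard a temporary copy of the
  // iterator on every pass; ++it just advances it, so it is the idiomatic
  // form whenever the old value is not needed.
  for (auto it = attrs.begin(); it != attrs.end(); ++it) {
    std::cout << it->first << " = " << it->second << "\n";
  }
  return 0;
}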
--- tensorflow/core/graph/mkl_layout_pass.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index 5a4c211c194..f12334358de 100644 --- a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -2319,7 +2319,7 @@ void MklLayoutRewritePass::CopyAttrsAll(const Node* orig_node, NodeBuilder* nb, name = iter->first; auto attr = iter->second; nb->Attr(name, attr); - iter++; + ++iter; } } From aad6f1bb761bf4244a0a7c35afa2932638015478 Mon Sep 17 00:00:00 2001 From: Amit Srivastava Date: Tue, 26 Mar 2019 21:31:01 +0530 Subject: [PATCH 0082/3053] Fixed warning for the FloorMod. Removed the warning from the file. --- tensorflow/lite/kernels/internal/reference/reference_ops.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/kernels/internal/reference/reference_ops.h b/tensorflow/lite/kernels/internal/reference/reference_ops.h index ce34f525c37..2141ab82140 100644 --- a/tensorflow/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/lite/kernels/internal/reference/reference_ops.h @@ -2631,8 +2631,8 @@ T FloorMod(T input1, T input2) { std::modulus, FloatMod>::type; ModFunc mod_func; T trunc_mod = mod_func(input1, input2); - return trunc_mod != 0 && ((input2 < 0) != (trunc_mod < 0)) - ? trunc_mod + input2 + return (trunc_mod != 0) && ((input2 < 0) != (trunc_mod < 0)) + ? (trunc_mod + input2) : trunc_mod; } From fa52c1c13f8746c05759ffe850f5caa5519cb4ad Mon Sep 17 00:00:00 2001 From: Amit Srivastava Date: Tue, 23 Apr 2019 06:35:22 +0530 Subject: [PATCH 0083/3053] Bug Fix and removed trivail warning from the file. Added a Bug Fix, TC and removed warnings from the file. --- tensorflow/lite/arena_planner.cc | 23 ++++++++++++++--------- tensorflow/lite/arena_planner_test.cc | 12 ++++++++++++ 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/tensorflow/lite/arena_planner.cc b/tensorflow/lite/arena_planner.cc index e695c43f13a..3258f612c18 100644 --- a/tensorflow/lite/arena_planner.cc +++ b/tensorflow/lite/arena_planner.cc @@ -153,7 +153,7 @@ TfLiteStatus ArenaPlanner::PlanAllocations() { } } // Go through the graph in execution order. - for (int i = 0; i < graph_info_->num_nodes(); ++i) { + for (size_t i = 0; i < graph_info_->num_nodes(); ++i) { const TfLiteNode& node = graph_info_->node(i); // First queue output tensors for allocation. @@ -193,7 +193,7 @@ TfLiteStatus ArenaPlanner::ExecuteAllocations(int first_node, int last_node) { TF_LITE_ENSURE_STATUS(CalculateAllocations(first_node, last_node)); TF_LITE_ENSURE_STATUS(Commit()); - for (int i = 0; i < graph_info_->num_tensors(); ++i) { + for (int i = 0; i < static_cast(graph_info_->num_tensors()); ++i) { // TODO(ahentz): we could do this only for the tensors that were modified // in CalculateAllocations(), instead of redoing it for tensors that // already had proper pointers. However we must be very careful, because @@ -237,9 +237,14 @@ TfLiteStatus ArenaPlanner::CalculateAllocations(int first_node, int last_node) { } } - // Don't forget to deallocate temporaries of last node. - TF_LITE_ENSURE_STATUS( - CalculateDeallocationOfInternalTensors(active_node - 1)); + // For the case if the graph is empty the node index can be negative since we + // substract from the active node, so the node_index can be zero for those + // cases + if (active_node > 0) { + // Don't forget to deallocate temporaries of last node. 
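[Editor's aside] The guard added here, together with the static_cast<int> comparisons above, works around the signed/unsigned trap called out in the new test below: comparing an int against a size_t promotes the int, so a negative node index silently wraps to a huge unsigned value and the range check answers the wrong question. A minimal stand-alone illustration with made-up values (not TensorFlow Lite code):

#include <cstddef>
#include <iostream>

int main() {
  int node_index = -1;
  std::size_t num_nodes = 10;

  // Mixed comparison: node_index is converted to size_t, -1 wraps to
  // SIZE_MAX, and the "in range" test quietly gives the wrong answer
  // (only a -Wsign-compare warning hints at the problem).
  if (node_index < num_nodes) {
    std::cout << "would only print if -1 really were below 10\n";
  } else {
    std::cout << "-1 wrapped to " << static_cast<std::size_t>(node_index)
              << ", so the range check fails\n";
  }

  // Casting the unsigned side to int restores the intended meaning.
  if (node_index < static_cast<int>(num_nodes)) {
    std::cout << "signed comparison: -1 < 10, as expected\n";
  }
  return 0;
}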
+ TF_LITE_ENSURE_STATUS( + CalculateDeallocationOfInternalTensors(active_node - 1)); + } return kTfLiteOk; } @@ -284,8 +289,8 @@ TfLiteStatus ArenaPlanner::CalculateTensorDeallocation(int tensor_index) { TfLiteStatus ArenaPlanner::CalculateAllocationOfInternalTensors( int node_index) { - if (node_index < graph_info_->num_nodes()) { - const TfLiteNode& node = graph_info_->node(node_index); + if (node_index < static_cast(graph_info_->num_nodes())) { + const TfLiteNode& node = graph_info_->node(static_cast(node_index)); TfLiteIntArray* node_temporaries = node.temporaries; for (int i = 0; i < node_temporaries->size; ++i) { int tensor_index = node_temporaries->data[i]; @@ -297,8 +302,8 @@ TfLiteStatus ArenaPlanner::CalculateAllocationOfInternalTensors( TfLiteStatus ArenaPlanner::CalculateDeallocationOfInternalTensors( int node_index) { - if (node_index < graph_info_->num_nodes()) { - const TfLiteNode& node = graph_info_->node(node_index); + if (node_index < static_cast(graph_info_->num_nodes())) { + const TfLiteNode& node = graph_info_->node(static_cast(node_index)); TfLiteIntArray* node_temporaries = node.temporaries; for (int i = 0; i < node_temporaries->size; ++i) { int tensor_index = node_temporaries->data[i]; diff --git a/tensorflow/lite/arena_planner_test.cc b/tensorflow/lite/arena_planner_test.cc index 3b6c9d5f54d..0e80d429c0d 100644 --- a/tensorflow/lite/arena_planner_test.cc +++ b/tensorflow/lite/arena_planner_test.cc @@ -211,6 +211,18 @@ TEST_F(ArenaPlannerTest, EmptyGraph) { Execute(0, 10); } +TEST_F(ArenaPlannerTest, DeallocationOfInputTensor) { + // This is a negative TC, which will try to make sure that no allocation for + // input tensors is done, when making call with negative node_index, since + // previous check was doing comparison of node_index which was int and + // unsigned int, implicit conversion was passing this case, as the negative + // number was converted to unsigned it making it invalid.The new check + // takes care of this problem and removes the warning as well. + TestGraph graph({-1}, {}, {1}); + SetGraph(&graph); + Execute(0, 10); +} + TEST_F(ArenaPlannerTest, GraphWithNoOps) { TestGraph graph({0, 10}, {}, {5, 11}); SetGraph(&graph); From 24a66e3cac31058f1d1557056a831cb710511512 Mon Sep 17 00:00:00 2001 From: Lukas Folle Date: Thu, 11 Jul 2019 10:29:23 +0200 Subject: [PATCH 0084/3053] Fixed too long lines. 
--- tensorflow/python/keras/backend.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py index 3e1cc87eee9..108678631a9 100644 --- a/tensorflow/python/keras/backend.py +++ b/tensorflow/python/keras/backend.py @@ -2689,7 +2689,8 @@ def repeat_elements(x, rep, axis): ```python >>> b = tf.constant([1, 2, 3]) >>> tf.keras.backend.repeat_elements(b, rep=2, axis=0) - + ``` """ x_shape = x.shape.as_list() @@ -2788,7 +2789,8 @@ def arange(start, stop=None, step=1, dtype='int32'): Example: ```python >>> tf.keras.backend.arange(start=0, stop=10, step=1.5) - + ``` @@ -2837,7 +2839,8 @@ def flatten(x): array([[1, 2], [3, 4]], dtype=int32)> >>> tf.keras.backend.flatten(b) - + ``` """ return array_ops.reshape(x, [-1]) From 7a2419faaa925a86f674bba927ad0881dcde7805 Mon Sep 17 00:00:00 2001 From: jerryyin Date: Tue, 9 Jul 2019 12:15:17 -0500 Subject: [PATCH 0085/3053] Addressing review comments --- .../core/kernels/depthwise_conv_op_gpu.h | 28 ++++++------------- 1 file changed, 9 insertions(+), 19 deletions(-) diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.h b/tensorflow/core/kernels/depthwise_conv_op_gpu.h index ec13259127e..73606a80273 100644 --- a/tensorflow/core/kernels/depthwise_conv_op_gpu.h +++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.h @@ -78,7 +78,7 @@ inline EIGEN_DEVICE_FUNC bool CanLaunchDepthwiseConv2dBackpropFilterGPUSmall( // convolution depending on a template argument of this enum. enum DepthwiseConv2dDirection { DIRECTION_FORWARD, DIRECTION_BACKWARD }; -// A Gpu kernel to compute the depthwise convolution forward pass +// A GPU kernel to compute the depthwise convolution forward pass // in NHWC format. template @@ -191,10 +191,8 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall( typedef typename detail::PseudoHalfType::Type S; assert(CanLaunchDepthwiseConv2dGPUSmall(args)); // Holds block plus halo and filter data for blockDim.x depths. - static_assert(sizeof(S) <= 8, "Insufficient alignment detected"); - GPU_DYNAMIC_SHARED_MEM_DECL(8, unsigned char, shared_memory); - + static_assert(sizeof(S) <= 8, "Insufficient alignment detected"); S* const shared_data = reinterpret_cast(shared_memory); const int num_batches = args.batch; @@ -324,7 +322,7 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall( } } -// A Gpu kernel to compute the depthwise convolution forward pass +// A GPU kernel to compute the depthwise convolution forward pass // in NCHW format. template @@ -481,10 +479,8 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall( typedef typename detail::PseudoHalfType::Type S; assert(CanLaunchDepthwiseConv2dGPUSmall(args)); // Holds block plus halo and filter data for blockDim.z depths. - static_assert(sizeof(S) <= 8, "Insufficient alignment detected"); - GPU_DYNAMIC_SHARED_MEM_DECL(8, unsigned char, shared_memory); - + static_assert(sizeof(S) <= 8, "Insufficient alignment detected"); S* const shared_data = reinterpret_cast(shared_memory); const int num_batches = args.batch; @@ -782,7 +778,7 @@ Status LaunchDepthwiseConv2dGPU(OpKernelContext* ctx, const DepthwiseArgs& args, } } -// A simple launch pad to launch the Gpu kernel for depthwise convolution. +// A simple launch pad to launch the GPU kernel for depthwise convolution. 
template void LaunchDepthwiseConvOp::operator()(OpKernelContext* ctx, const DepthwiseArgs& args, @@ -1001,7 +997,7 @@ Status LaunchDepthwiseConv2dBackpropInputGPU(OpKernelContext* ctx, } } -// A simple launch pad to launch the Gpu kernel for depthwise convolution. +// A simple launch pad to launch the GPU kernel for depthwise convolution. template void LaunchDepthwiseConvBackpropInputOp::operator()( OpKernelContext* ctx, const DepthwiseArgs& args, const T* out_backprop, @@ -1126,11 +1122,7 @@ __global__ void __launch_bounds__(640, 2) // Device function to compute sub-warp sum reduction for a power-of-two group of // neighboring threads. template -#if GOOGLE_CUDA __device__ __forceinline__ T WarpSumReduce(T val) { -#elif TENSORFLOW_USE_ROCM -__device__ inline T WarpSumReduce(T val) { -#endif // support only power-of-two widths. assert(__popc(kWidth) == 1); int sub_warp = GpuLaneId() / kWidth; @@ -1165,10 +1157,8 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall( typedef typename detail::PseudoHalfType::Type S; assert(CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(args, blockDim.z)); // Holds block plus halo and filter data for blockDim.x depths. - static_assert(sizeof(S) <= 8, "Insufficient alignment detected"); - GPU_DYNAMIC_SHARED_MEM_DECL(8, unsigned char, shared_memory); - + static_assert(sizeof(S) <= 8, "Insufficient alignment detected"); S* const shared_data = reinterpret_cast(shared_memory); const int num_batches = args.batch; @@ -1310,7 +1300,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall( } } -// A Gpu kernel to compute the depthwise convolution backprop w.r.t. filter. +// A GPU kernel to compute the depthwise convolution backprop w.r.t. filter. template __global__ void __launch_bounds__(640, 2) @@ -1754,7 +1744,7 @@ Status LaunchDepthwiseConv2dBackpropFilterGPU( } } -// A simple launch pad to launch the Gpu kernel for depthwise convolution. +// A simple launch pad to launch the GPU kernel for depthwise convolution. template void LaunchDepthwiseConvBackpropFilterOp::operator()( OpKernelContext* ctx, const DepthwiseArgs& args, const T* out_backprop, From 53d014aa871e268c37ced38c8f574f611a0514d2 Mon Sep 17 00:00:00 2001 From: jerryyin Date: Thu, 11 Jul 2019 14:56:04 -0500 Subject: [PATCH 0086/3053] Adding ROCm support to depthwise_conv_grad_op --- .../core/kernels/depthwise_conv_grad_op.cc | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/tensorflow/core/kernels/depthwise_conv_grad_op.cc b/tensorflow/core/kernels/depthwise_conv_grad_op.cc index b29e8323332..5ddcf1d816b 100644 --- a/tensorflow/core/kernels/depthwise_conv_grad_op.cc +++ b/tensorflow/core/kernels/depthwise_conv_grad_op.cc @@ -37,10 +37,14 @@ limitations under the License. #include "tensorflow/core/util/use_cudnn.h" #include "tensorflow/core/util/work_sharder.h" +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + #if GOOGLE_CUDA #include "third_party/gpus/cudnn/cudnn.h" +#endif + #include "tensorflow/core/platform/stream_executor.h" -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM namespace tensorflow { @@ -517,7 +521,7 @@ extern template struct LaunchConv2DBackpropInputOp; extern template struct LaunchConv2DBackpropInputOp; extern template struct LaunchConv2DBackpropInputOp; -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM // Extern template instantiated in conv_grad_input_ops.cc. 
extern template struct LaunchConv2DBackpropInputOp; @@ -530,7 +534,7 @@ extern template struct LaunchDepthwiseConvBackpropInputOp; extern template struct LaunchDepthwiseConvBackpropInputOp; -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM // Kernel to compute the input backprop for depthwise convolution. template @@ -677,7 +681,7 @@ TF_CALL_double(REGISTER_CPU_KERNEL); #endif #undef REGISTER_CPU_KERNEL -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define REGISTER_GPU_KERNEL(T) \ REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput") \ @@ -715,7 +719,7 @@ TF_CALL_float(REGISTER_GROUPED_CONV_KERNEL); TF_CALL_double(REGISTER_GROUPED_CONV_KERNEL); #undef REGISTER_GROUPED_CONV_KERNEL #endif // CUDNN_VERSION -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM // Kernels to compute the gradients of the filters for depthwise convolution. @@ -991,7 +995,7 @@ extern template struct LaunchConv2DBackpropFilterOp; extern template struct LaunchConv2DBackpropFilterOp; extern template struct LaunchConv2DBackpropFilterOp; -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM // Extern template instantiated in conv_grad_filter_ops.cc. extern template struct LaunchConv2DBackpropFilterOp; @@ -1004,7 +1008,7 @@ extern template struct LaunchDepthwiseConvBackpropFilterOp; extern template struct LaunchDepthwiseConvBackpropFilterOp; -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM // Kernel to compute the filter backprop for depthwise convolution. template @@ -1160,7 +1164,7 @@ TF_CALL_double(REGISTER_CPU_KERNEL); #endif #undef REGISTER_CPU_KERNEL -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define REGISTER_GPU_KERNEL(T) \ REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropFilter") \ .Device(DEVICE_GPU) \ @@ -1197,6 +1201,6 @@ TF_CALL_float(REGISTER_GROUPED_CONV_KERNEL); TF_CALL_double(REGISTER_GROUPED_CONV_KERNEL); #undef REGISTER_GROUPED_CONV_KERNEL #endif // CUDNN_VERSION -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } // namespace tensorflow From 3cf6dd0238dd90f92719df2ea3bab32445a72813 Mon Sep 17 00:00:00 2001 From: Fred Reiss Date: Thu, 11 Jul 2019 16:34:58 -0700 Subject: [PATCH 0087/3053] Disable warning instead of modifying code --- .../saved_model/integration_tests/integration_scripts.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/examples/saved_model/integration_tests/integration_scripts.py b/tensorflow/examples/saved_model/integration_tests/integration_scripts.py index 8ac44131708..2fce2e6c559 100644 --- a/tensorflow/examples/saved_model/integration_tests/integration_scripts.py +++ b/tensorflow/examples/saved_model/integration_tests/integration_scripts.py @@ -61,4 +61,5 @@ def MaybeRunScriptInstead(): # Append current path to import path and execute `SCRIPT_NAME` main. sys.path.extend([os.path.dirname(__file__)]) module_name = os.environ["SCRIPT_NAME"] - app.run(importlib.import_module(module_name).main) + retval = app.run(importlib.import_module(module_name).main) # pylint: disable=assignment-from-no-return + sys.exit(retval) From a238dd2804e2d6ac108aaee2cbcad00f0d5d7f7d Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Thu, 11 Jul 2019 18:35:46 -0700 Subject: [PATCH 0088/3053] Inital commit: removed serialized string from dynamic TRT engine. 
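[Editor's summary] Since the diff below is large, the mechanism in brief: for dynamic engines the segment GraphDef is no longer serialized into the serialized_segment attr; the segment is always registered as a *_native_segment FunctionDef, and TRTEngineOp rebuilds a GraphDef from that function when the kernel is constructed, recording the _Arg/_Retval node ids so engine binding names can be derived later. A condensed sketch of that reconstruction path follows, using only calls that appear in the new funcdef_to_graphdef.cc; SegmentFunctionToGraphDef is an illustrative name, and the real helper additionally renames IO nodes with the TensorRTInputPH_/TensorRTOutputPH_ prefixes, which is omitted here.

#include <vector>

#include "tensorflow/core/common_runtime/function.h"
#include "tensorflow/core/framework/function.h"
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/graph/graph_constructor.h"
#include "tensorflow/core/lib/core/errors.h"

namespace tensorflow {

// Sketch only: rebuild a GraphDef from an already-instantiated native-segment
// function and collect the ids of its _Arg/_Retval nodes.
Status SegmentFunctionToGraphDef(FunctionLibraryRuntime::Handle handle,
                                 FunctionLibraryRuntime* flr,
                                 GraphDef* graph_def,
                                 std::vector<int>* input_node_ids,
                                 std::vector<int>* output_node_ids) {
  const FunctionBody* fbody = flr->GetFunctionBody(handle);
  if (fbody == nullptr) {
    return errors::Internal("No function body for the given handle");
  }
  Graph graph(flr->GetFunctionLibraryDefinition());
  CopyGraph(*fbody->graph, &graph);
  for (const Node* n : graph.nodes()) {
    if (n->IsArg()) input_node_ids->push_back(n->id());
    if (n->IsRetval()) output_node_ids->push_back(n->id());
  }
  // The real code serializes with a custom traversal that preserves the IO
  // prefixes; plain ToGraphDef is used here for brevity.
  graph.ToGraphDef(graph_def);
  return Status::OK();
}

}  // namespace tensorflow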
--- tensorflow/compiler/tf2tensorrt/BUILD | 7 + .../tf2tensorrt/convert/convert_graph.cc | 81 +++++---- .../tf2tensorrt/convert/convert_graph.h | 12 ++ .../tf2tensorrt/convert/convert_nodes.cc | 40 ++-- .../tf2tensorrt/kernels/trt_engine_op.cc | 74 +++++++- .../tf2tensorrt/kernels/trt_engine_op_test.cc | 35 +++- .../tf2tensorrt/utils/funcdef_to_graphdef.cc | 172 ++++++++++++++++++ .../tf2tensorrt/utils/funcdef_to_graphdef.h | 42 +++++ .../test/tf_trt_integration_test_base.py | 10 +- 9 files changed, 415 insertions(+), 58 deletions(-) create mode 100644 tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc create mode 100644 tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.h diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD index bfaae215709..bca101c4a53 100644 --- a/tensorflow/compiler/tf2tensorrt/BUILD +++ b/tensorflow/compiler/tf2tensorrt/BUILD @@ -168,6 +168,7 @@ tf_cuda_cc_test( ":trt_op_kernels", ":trt_op_libs", ":trt_resources", + ":trt_conversion", "@com_google_googletest//:gtest", "//tensorflow/cc:cc_ops", "//tensorflow/cc:ops", @@ -238,11 +239,13 @@ tf_cuda_library( "utils/calibration_resource.cc", "utils/trt_int8_calibrator.cc", "utils/trt_lru_cache.cc", + "utils/funcdef_to_graphdef.cc", ], hdrs = [ "utils/calibration_resource.h", "utils/trt_int8_calibrator.h", "utils/trt_lru_cache.h", + "utils/funcdef_to_graphdef.h", ], deps = [ ":trt_allocator", @@ -250,6 +253,10 @@ tf_cuda_library( ":utils", "//tensorflow/core:framework_headers_lib", "//tensorflow/core:framework_lite", + #"//tensorflow/core:framework", + "//tensorflow/core/grappler:op_types", + "//tensorflow/core:graph", + "//tensorflow/core:gpu_runtime", "//tensorflow/core:lib_proto_parsing", ] + if_tensorrt([":tensorrt_lib"]), ) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index fb5dda9953e..0c2831df275 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -135,6 +135,7 @@ Status GetEngineInfo(const Graph* g, DeviceNameUtils::ParsedName parsed_name; const bool parse_succeeded = DeviceNameUtils::ParseFullName(node_device, &parsed_name); + VLOG(0) << node_device; if (!parse_succeeded || (parse_succeeded && parsed_name.type == "CPU")) { string msg; if (!parse_succeeded) { @@ -441,7 +442,8 @@ Status CreateTRTNode(const ConversionParams& params, segment_string = string(static_cast(engine_data->data()), engine_data->size()); } else { - segment_string = info.segment_graph_def.SerializeAsString(); + //segment_string = info.segment_graph_def.SerializeAsString(); + segment_string = ""; } string prec_string; @@ -461,15 +463,13 @@ Status CreateTRTNode(const ConversionParams& params, } NodeDef trt_node; + //TODO(phillip-kravtsov): use_function_backup: fix this Status status = node_builder.Attr("input_shapes", input_shape_protos) .Attr("output_shapes", output_shape_protos) .Attr("static_engine", info.engine_type == EngineInfo::EngineType::TRTStatic) - .Attr("segment_funcdef_name", - params.use_function_backup - ? 
StrCat(info.engine_name, "_native_segment") - : "") + .Attr("segment_funcdef_name", StrCat(info.engine_name, "_native_segment")) .Attr("serialized_segment", segment_string) .Attr("calibration_data", "") .Attr("max_cached_engines_count", info.maximum_cached_engines) @@ -539,15 +539,15 @@ Status CreateTRTNode(const ConversionParams& params, } // Function to construct a funcdef from the segment and add it to the graph. -Status RegisterSegmentFunctionToFunctionLibrary(Graph* graph, - const GraphDef& segment, - const string& engine_name) { - Graph sgraph(graph->flib_def()); +Status ModifyGraphForFunctionDef(Graph* graph, + const GraphDef& segment, + Graph* sgraph) { + //Graph sgraph(graph->flib_def()); GraphConstructorOptions gcopts; - TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(gcopts, segment, &sgraph)); + TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(gcopts, segment, sgraph)); std::map io_nodes; int num_inputs = 0; - for (auto n : sgraph.op_nodes()) { + for (auto n : sgraph->op_nodes()) { if (absl::StartsWith(n->name(), kInputPHName)) { num_inputs++; io_nodes.insert({n->name(), n}); @@ -567,12 +567,12 @@ Status RegisterSegmentFunctionToFunctionLibrary(Graph* graph, .Attr("index", i) .Finalize(&nd)); Status s; - auto node_arg = sgraph.AddNode(nd, &s); + auto node_arg = sgraph->AddNode(nd, &s); if (!s.ok()) { LOG(ERROR) << "Couldn't add _Arg node for " << name; } for (auto edge : node->out_edges()) { - sgraph.AddEdge(node_arg, 0, edge->dst(), edge->dst_input()); + sgraph->AddEdge(node_arg, 0, edge->dst(), edge->dst_input()); VLOG(1) << "Updating funcdef input " << node_arg->name() << ":" << 0 << " - > " << edge->dst()->name() << ":" << edge->dst_input(); if (!s.ok()) { @@ -580,7 +580,7 @@ Status RegisterSegmentFunctionToFunctionLibrary(Graph* graph, << " to " << edge->dst()->name() << ":" << edge->dst_input(); } } - sgraph.RemoveNode(node); + sgraph->RemoveNode(node); } for (int i = 0; i < io_nodes.size() - num_inputs; ++i) { @@ -604,34 +604,40 @@ Status RegisterSegmentFunctionToFunctionLibrary(Graph* graph, VLOG(3) << nd.DebugString(); } Status s; - auto node_ret = sgraph.AddNode(nd, &s); + auto node_ret = sgraph->AddNode(nd, &s); if (!s.ok()) { LOG(ERROR) << "Couldn't add _Ret node for " << name; } VLOG(1) << "Update edge from " << edge->src()->name() << ":" << edge->src_output() << " - > " << node_ret->name() << ":" << 0; - sgraph.AddEdge(edge->src(), edge->src_output(), node_ret, 0); - s = sgraph.UpdateEdge(edge->src(), edge->src_output(), node_ret, 0); + sgraph->AddEdge(edge->src(), edge->src_output(), node_ret, 0); + s = sgraph->UpdateEdge(edge->src(), edge->src_output(), node_ret, 0); if (!s.ok()) { LOG(ERROR) << "Failed to update edge from " << edge->src()->name() << ":" << edge->src_output() << " - > " << node_ret->name() << ":" << 0; } - sgraph.RemoveNode(node); + sgraph->RemoveNode(node); } - FunctionDefLibrary fdeflib; + return Status::OK(); +} + +Status RegisterModifiedGraphToFunctionLibrary(Graph* sgraph, Graph* graph, + FunctionDefLibrary fdeflib, + const string& engine_name) { auto native_segment = fdeflib.add_function(); TF_RETURN_IF_ERROR(GraphToFunctionDef( - sgraph, StrCat(engine_name, "_native_segment"), native_segment)); + *sgraph, StrCat(engine_name, "_native_segment"), native_segment)); // Set kIntsonDeviceAttr to true so that all TRTEngineOp outputs are always on // a GPU device as expected. Otherwise, some of the tensors of type DT_INT32 // would be on host if the op generating the tensor has host memory tag set. 
(*native_segment ->mutable_attr())[FunctionLibraryDefinition::kIntsOnDeviceAttr] .set_b(true); - if (VLOG_IS_ON(7)) { - VLOG(7) << engine_name << " Function_Def "; - VLOG(7) << native_segment->DebugString(); + //TODO(phillip-kravtsov): set this back to 7 + if (VLOG_IS_ON(0)) { + VLOG(0) << engine_name << " Function_Def "; + VLOG(0) << native_segment->DebugString(); } VLOG(1) << "Adding funcdef to graphlib"; TF_RETURN_IF_ERROR(graph->AddFunctionLibrary(fdeflib)); @@ -761,14 +767,24 @@ Status ConvertAfterShapes(const ConversionParams& params) { : EngineInfo::EngineType::TRTStatic); curr_engine.use_calibration = params.use_calibration; curr_engine.maximum_cached_engines = params.max_cached_engines; - if (params.use_function_backup) { - status = RegisterSegmentFunctionToFunctionLibrary( - &graph, curr_engine.segment_graph_def, curr_engine.engine_name); - if (!status.ok()) { - LOG(WARNING) << "Failed to register segment graphdef as a function " - << t << ": " << status; - continue; - } + + + Graph sgraph(flib); + status = ModifyGraphForFunctionDef(&graph, curr_engine.segment_graph_def, + &sgraph); + if (!status.ok()) { + LOG(WARNING) << "Failed to modify graph as a function " + << t << ": " << status; + continue; + } + FunctionDefLibrary fdeflib; + status = RegisterModifiedGraphToFunctionLibrary(&sgraph, &graph, + fdeflib, curr_engine.engine_name); + + if (!status.ok()) { + LOG(WARNING) << "Failed to register segment graphdef as a function " + << t << ": " << status; + continue; } engine_bytes_size.push_back(curr_engine.segment_graph_def.ByteSizeLong()); @@ -777,7 +793,8 @@ Status ConvertAfterShapes(const ConversionParams& params) { engine_segments.push_back(std::move(curr_engine)); converted_segments.push_back(std::move(curr_segment)); - if (VLOG_IS_ON(8)) { + if (VLOG_IS_ON(8) && + curr_engine.engine_type == EngineInfo::EngineType::TRTStatic) { string fname = engine_segments.back().engine_name; StrAppend(&fname, ".pb"); std::fstream f; diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h index d7f1df5a102..74135e56cf4 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h @@ -19,6 +19,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" #include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/grappler/clusters/cluster.h" #include "tensorflow/core/grappler/costs/graph_properties.h" #include "tensorflow/core/lib/core/status.h" @@ -57,6 +58,17 @@ Status ConvertAfterShapes(const ConversionParams& params); std::pair GetDeviceAndAllocator(const ConversionParams& params, const EngineInfo& engine); +/*Status RegisterSegmentFunctionToFunctionLibrary(Graph* graph, + const GraphDef& segment, + const string& engine_name); + */ +Status ModifyGraphForFunctionDef(Graph* graph, + const GraphDef& segment, + Graph* sgraph); + +Status RegisterModifiedGraphToFunctionLibrary(Graph* sgraph, Graph* graph, + FunctionDefLibrary fdeflib, + const string& engine_name); } // namespace convert } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index c34f85e61a8..efb186c4c55 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -40,6 +40,7 @@ limitations under the License. #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/grappler/op_types.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/strings/numbers.h" @@ -5016,19 +5017,30 @@ Status ConvertGraphDefToEngine( for (const auto& node_def : gdef.node()) { string node_name = node_def.name(); VLOG(2) << "Converting op name=" << node_name << ", op=" << node_def.op(); - if (IsEngineInput(node_name) && (node_def.op() == "Placeholder")) { + if (IsEngineInput(node_name)){ int32 slot_number = -1; - if (!strings::safe_strto32( // non-absl ok - node_name.c_str() + strlen(kInputPHName), &slot_number)) { - return errors::InvalidArgument("Failed to parse slot number from ", - node_name); + string type_key; + if (node_def.op() == "Placeholder") { + if (!strings::safe_strto32( // non-absl ok + node_name.c_str() + strlen(kInputPHName), &slot_number)) { + return errors::InvalidArgument("Failed to parse slot number from ", + node_name); + } + type_key = "dtype"; + } else if (tensorflow::grappler::IsArg(node_def)) { + // Maybe remove the dependence on grappler and re-implement IsArg, + // which is pretty simple (but could change if new Arg nodes are added) + slot_number = node_def.attr().at("index").i(); + type_key = "T"; + } else { + return errors::InvalidArgument("Node ", node_name, " with name starting with kInputPHName is neither Placeholder nor Arg, instead ", node_def.op()); } nvinfer1::DataType trt_dtype; nvinfer1::Dims trt_dims; int batch_size = -1; auto shape = input_shapes.at(slot_number); auto status = ValidateTensorProperties( - node_def.op(), node_def.attr().at("dtype").type(), shape, + node_def.op(), node_def.attr().at(type_key).type(), shape, /*validation_only=*/false, &trt_dtype, &trt_dims, &batch_size); if (!status.ok()) { const string error_message = @@ -5044,12 +5056,18 @@ Status ConvertGraphDefToEngine( // engines offline, by calling sess.run() and cache/serialize the engines. 
TF_RETURN_IF_ERROR( converter.AddInputTensor(node_name, trt_dtype, trt_dims, batch_size)); - } else if (IsEngineOutput(node_name) && (node_def.op() == "Identity")) { + } else if (IsEngineOutput(node_name)) { int32 slot_number = -1; - if (!strings::safe_strto32( // non-absl ok - node_name.c_str() + strlen(kOutputPHName), &slot_number)) { - return errors::InvalidArgument("Failed to parse slot number from ", - node_name); + if (node_def.op() == "Identity") { + if (!strings::safe_strto32( // non-absl ok + node_name.c_str() + strlen(kOutputPHName), &slot_number)) { + return errors::InvalidArgument("Failed to parse slot number from ", + node_name); + } + } else if (tensorflow::grappler::IsRetval(node_def)) { + slot_number = node_def.attr().at("index").i(); + } else { + return errors::InvalidArgument("Node with name ", node_name, " starting with kOutputPHName is neither Identity nor Retval, instead ", node_def.op()); } // Get output type that TensorFlow expects TFAttrs attrs(node_def); diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index 6bb73e2b3d8..f2d8a7ef9fc 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -25,6 +25,7 @@ limitations under the License. #include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h" +#include "tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/op.h" @@ -90,8 +91,11 @@ class TRTEngineOp : public AsyncOpKernel { void ExecuteCalibration(OpKernelContext* ctx, AsyncHelper* helper); // Construct a function handle for executing native funcdef graph + // These are the exact same function. Status ConstructFunctionHandle(OpKernelContext* ctx); + Status ConstructFunctionHandle(OpKernelConstruction* ctx); + // Execute replaced native segment as function Op. void ExecuteNativeSegment(OpKernelContext* ctx, AsyncHelper* helper); @@ -120,6 +124,12 @@ class TRTEngineOp : public AsyncOpKernel { std::vector input_nodes_; std::vector output_nodes_; + // The id's in these vectors are used for getting slot numbers and + // node names after they are uniquified in graph->graphdef conversion. + + std::vector input_node_ids_; + std::vector output_node_ids_; + // serialized protobuf segment or trt engine depending on static_engine_ flag. 
string serialized_segment_; @@ -194,6 +204,29 @@ Status TRTEngineOp::ConstructFunctionHandle(OpKernelContext* ctx) { &native_func_); } +Status TRTEngineOp::ConstructFunctionHandle(OpKernelConstruction* ctx) { + VLOG(1) << "Constructing function handle"; + auto lib = ctx->function_library(); + if (lib == nullptr) { + return errors::Internal("Context function library is null"); + } + auto func_names = lib->GetFunctionLibraryDefinition()->ListFunctionNames(); + for (auto func_name : func_names) { + VLOG(0) << "Func name: " << func_name; + } + auto fdef = lib->GetFunctionLibraryDefinition()->Find(funcdef_name_); + if (fdef == nullptr) { + return errors::Internal("Native FunctionDef ", funcdef_name_, + " can't be found in function library"); + } + FunctionLibraryRuntime::InstantiateOptions inst_ops; + inst_ops.state_handle = ""; + inst_ops.target = ctx->device()->name(); + native_func_ = 0; + return lib->Instantiate(funcdef_name_, AttrSlice(&fdef->attr()), inst_ops, + &native_func_); +} + TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) : AsyncOpKernel(context) { // read serialized_engine @@ -202,7 +235,7 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) OP_REQUIRES_OK(context, context->GetAttr("workspace_size_bytes", &workspace_size_)); OP_REQUIRES_OK(context, context->GetAttr("static_engine", &static_engine_)); - if (!static_engine_) { + /*if (!static_engine_) { OP_REQUIRES(context, segment_graph_.ParseFromString(serialized_segment_), errors::InvalidArgument("Failed to parse segment graphdef!")); VLOG(1) << "Size of serialized GraphDef: " @@ -210,7 +243,8 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) string tmp; // Swap with temporary empty string to deallocate the CPU memory. serialized_segment_.swap(tmp); - } + }*/ + VLOG(1) << "Constructing " << name(); string precision_string; OP_REQUIRES_OK(context, @@ -224,6 +258,25 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) TrtPrecisionModeFromName(precision_string, &precision_mode_)); OP_REQUIRES_OK(context, context->GetAttr("use_calibration", &use_calibration_)); + native_func_ = kInvalidHandle; + if (!static_engine_) { + //TODO(phillip-kravtsov) error checking here: how? + VLOG(0) << "Funcdef_name: " << funcdef_name_; + VLOG(0) << "Static Engine? " << static_engine_; + Status status = ConstructFunctionHandle(context); + VLOG(0) << "Status: " << status; + FunctionLibraryRuntime* lib = context->function_library(); + VLOG(0) << "Funcdef to graphdef"; + FunctionDefToGraphDef(native_func_, lib, &segment_graph_, + &input_node_ids_, &output_node_ids_); + for (int id : input_node_ids_) { + VLOG(0) << "Input node id: " << id << " from engine " << name(); + } + for (int id : output_node_ids_) { + VLOG(0) << "Output node id: " << id << " from engine " << name(); + } + + } calibration_mode_ = (use_calibration_ && precision_mode_ == TrtPrecisionMode::INT8 && calibration_data.empty()); @@ -231,7 +284,6 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) calibrator_.reset(new TRTInt8Calibrator(calibration_data)); calibration_data.resize(0); } - native_func_ = kInvalidHandle; OP_REQUIRES_OK(context, context->GetAttr("max_cached_engines_count", &max_cached_engines_)); } @@ -300,7 +352,9 @@ void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx, const auto device_tensor = calib_res->device_tensors_.at(i).AccessTensor(ctx); CHECK_EQ(t.TotalBytes(), device_tensor->TotalBytes()); - input_data.emplace(StrCat(kInputPHName, i), data_address); + input_data.emplace(StrCat(kInputPHName, + static_engine_ ? 
i : input_node_ids_[i]), + data_address); } VLOG(2) << "Filled map for sending"; // copied from cuda_kernel_helper since it seems only valid in *.cu.cc files @@ -437,9 +491,15 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx, // input. const int num_batch = ctx->input(0).shape().dim_size(0); const int num_binding = ctx->num_inputs() + ctx->num_outputs(); + for (int i = 0; i < num_binding; i++) { + auto binding_name = cuda_engine->getBindingName(i); + VLOG(0) << "Binding name for index " << i << " " << binding_name; + } + std::vector buffers(num_binding); + for (int i = 0; i < ctx->num_inputs(); i++) { - const string input_name = StrCat(kInputPHName, i); + const string input_name = StrCat(kInputPHName, static_engine_ ? i : input_node_ids_[i]); const int binding_index = cuda_engine->getBindingIndex(input_name.c_str()); if (binding_index == -1) { const string msg = @@ -481,7 +541,7 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx, for (int i = 0; i < ctx->num_outputs(); i++) { // Create an output tensor - const string output_name = StrCat(kOutputPHName, i); + const string output_name = StrCat(kOutputPHName, static_engine_ ? i : output_node_ids_[i]); const int binding_index = cuda_engine->getBindingIndex(output_name.c_str()); Tensor* output_tensor = nullptr; @@ -720,7 +780,7 @@ Status TRTEngineOp::AllocateCalibrationResources(OpKernelContext* ctx, "Unsupported data type encountered in input ", i); } cres->device_buffers_.emplace( - StrCat(kInputPHName, i), + StrCat(kInputPHName, static_engine_ ? i : input_node_ids_[i]), std::pair(device_address, device_tensor->TotalBytes())); } cres->calibrator_.reset( diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc index d859d5f957f..6205254c72a 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc @@ -23,10 +23,14 @@ limitations under the License. #include #include #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/compiler/tf2tensorrt/convert/convert_graph.h" #include "tensorflow/compiler/tf2tensorrt/utils/calibration_resource.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h" #include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/kernels/ops_testutil.h" @@ -47,7 +51,6 @@ class TRTEngineOpTestBase : public OpsTestBase { // Create the GPU device. std::unique_ptr device( DeviceFactory::NewDevice("GPU", {}, "/job:worker/replica:0/task:0")); - // Create simple TF graph. Scope s = Scope::NewRootScope(); auto feed = ops::Placeholder(s.WithOpName("TensorRTInputPH_0"), dtype, @@ -58,6 +61,32 @@ class TRTEngineOpTestBase : public OpsTestBase { // Serialize the graph. TRTEngineOp will convert it using dynamic mode. 
GraphDef graph_def; TF_ASSERT_OK(s.ToGraphDef(&graph_def)); + /* + //VLOG(0) << "Beginning TRTEngineOpTest new code"; + */ + const string func_name = "myop_native_segment"; + Graph* graph = s.graph(); + Graph sgraph(graph->flib_def()); + TF_ASSERT_OK(convert::ModifyGraphForFunctionDef( + graph, graph_def, &sgraph)); + TF_ASSERT_OK(convert::RegisterModifiedGraphToFunctionLibrary(&sgraph, graph, + flib_def_->ToProto(), "myop")); + //TF_ASSERT_OK(convert::RegisterSegmentFunctionToFunctionLibrary(graph, graph_def, "myop")); + + //FunctionDefLibrary fdeflib; + //VLOG(0) << "Before converting graph to function def"; + //auto native_segment = fdeflib.add_function(); + + //GraphToFunctionDef(*graph, func_name, native_segment); + //VLOG(0) << "After conversion from graph to func def"; + /*(*native_segment + ->mutable_attr())[FunctionLibraryDefinition::kIntsOnDeviceAttr] + .set_b(true); + */ + + //graph->AddFunctionLibrary(fdeflib); + //VLOG(0) << native_segment->DebugString(); + PartialTensorShape shape({-1, -1}); // Create the op. @@ -67,8 +96,8 @@ class TRTEngineOpTestBase : public OpsTestBase { .Attr("input_shapes", {shape}) .Attr("output_shapes", {shape}) .Attr("static_engine", false) - .Attr("segment_funcdef_name", "") // no native fallback - .Attr("serialized_segment", graph_def.SerializeAsString()) + .Attr("segment_funcdef_name", func_name) // no native fallback + .Attr("serialized_segment", "")//graph_def.SerializeAsString()) .Attr("calibration_data", "") .Attr("max_cached_engines_count", max_cached_engines_count) .Attr("workspace_size_bytes", 1 << 20) diff --git a/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc b/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc new file mode 100644 index 00000000000..38b39804113 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc @@ -0,0 +1,172 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.h" +//#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/core/common_runtime/graph_optimizer.h" +#include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/graph/algorithm.h" +#include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/platform/logging.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/ascii.h" + +namespace tensorflow { +namespace tensorrt { + +const char* const kInputPHName = "TensorRTInputPH_"; +const char* const kOutputPHName = "TensorRTOutputPH_"; +const char* const kInputPHNameLower = "tensorrtinputph_"; +const char* const kOutputPHNameLower = "tensorrtoutputph_"; + +string NewNameWithIOPrefix(const Node* n) { + if (absl::StartsWith(n->name(), kInputPHNameLower)){ + return strings::StrCat(kInputPHName, n->id()); + } + else if (absl::StartsWith(n->name(), kOutputPHNameLower)) { + return strings::StrCat(kOutputPHName, n->id()); + } + return strings::StrCat("n", n->id()); +} + +void ToGraphDefWithIOPrefix(const Graph* g, GraphDef* gdef) { + // This is the same function as in function.cc. However, it uses the + // NewName mapping above, which retains IO prefixes (kInputPHName etc) + gtl::InlinedVector inputs; + gdef->Clear(); + *gdef->mutable_versions() = g->versions(); + + std::vector start_nodes; + for (Node* n : g->nodes()) { + if (n->out_edges().empty()) { + start_nodes.push_back(n); + } + } + + ReverseDFSFrom(*g, start_nodes, nullptr, [gdef, &inputs](Node* n) { + if (!n->IsOp()) return; + NodeDef* ndef = gdef->add_node(); + ndef->set_name(NewNameWithIOPrefix(n)); + ndef->set_op(n->type_string()); + for (const auto& attr : n->attrs()) { + (*ndef->mutable_attr())[attr.first] = attr.second; + } + + if (!n->assigned_device_name().empty()) { + ndef->set_device(n->assigned_device_name()); + } else { + ndef->set_device(n->requested_device()); + } + + inputs.clear(); + inputs.resize(n->num_inputs()); + for (const Edge* e : n->in_edges()) { + if (e->IsControlEdge()) { + inputs.push_back(e); + } else { + if (inputs[e->dst_input()] == nullptr) { + inputs[e->dst_input()] = e; + } else { + LOG(WARNING) << "Malformed graph node. multiple input edges: " + << n->DebugString(); + } + } + } + // node->name() is merely NodeDef::name, which are not guaranteed + // to be unique and stable after optimization rewrites. Therefore, + // we use "n or " instead. 
+ for (const Edge* e : inputs) { + if (e == nullptr) { + ndef->add_input("unknown"); + continue; + } + const string srcname = NewNameWithIOPrefix(e->src()); + if (!e->src()->IsOp()) { + } else if (e->IsControlEdge()) { + ndef->add_input(strings::StrCat("^", srcname)); + } else if (e->src_output() == 0) { + ndef->add_input(srcname); + } else { + ndef->add_input(strings::StrCat(srcname, ":", e->src_output())); + } + } + }); +} + +Status FunctionDefToGraphDef(FunctionLibraryRuntime::Handle handle, + FunctionLibraryRuntime* flib_runtime, + GraphDef* graph_def, + std::vector* input_node_ids, + std::vector* output_node_ids) { + const FunctionLibraryDefinition* flib_def = flib_runtime->GetFunctionLibraryDefinition(); + const FunctionBody* fbody; + VLOG(0) << "Getting Function Body \n"; + VLOG(0) << "HANDLE" << handle; + fbody = flib_runtime->GetFunctionBody(handle); + //TF_RET_CHECK(*fbody) + std::unique_ptr graph(new Graph(flib_def)); + + CopyGraph(*fbody->graph, graph.get()); + + // Copied from compiler/xla/compile_xla.cc : + /* + OptimizerOptions opts; + opts.set_opt_level(OptimizerOptions::L0); + opts.set_do_common_subexpression_elimination(false); + opts.set_do_function_inlining(true); + opts.set_do_constant_folding(true); + GraphOptimizer optimizer(opts); + auto cf_consider_fn = [](const Node* n) { + for (const auto& output_arg : n->op_def().output_arg()) { + if (output_arg.type() == DT_VARIANT) { + return false; + } + } + return true; + }; + GraphOptimizer::Options graph_optimizer_options; + graph_optimizer_options.cf_consider_fn = cf_consider_fn; + + */ + //optimizer.Optimize(flib_runtime, flib_runtime->env(), + // /*device=*/nullptr, &graph, graph_optimizer_options); + + for (Node* n : graph->nodes()) { + auto id = n->id(); + if (n->IsArg()) { + VLOG(1) << "Arg Node id " << id; + input_node_ids->push_back(id); + } + if (n->IsRetval()) { + VLOG(1) << "Retval Node id " << id; + output_node_ids->push_back(id); + } + } + + ToGraphDefWithIOPrefix(graph.release(), graph_def); + + for (const auto node_def : graph_def->node()) { + string node_name = node_def.name(); + VLOG(0) << "NODENAME AFTER FROM FUNCDEF " << node_name << ", op=" << node_def.op(); + } + VLOG(0) << "Finished converting \n"; + + return Status::OK(); + +} + +} +} diff --git a/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.h b/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.h new file mode 100644 index 00000000000..ffc702679e0 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.h @@ -0,0 +1,42 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_FUNCDEF_TO_GRAPHDEF_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_FUNCDEF_TO_GRAPHDEF_H_ + +#include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/framework/function.h" + +#if GOOGLE_CUDA +#if GOOGLE_TENSORRT + +namespace tensorflow { + +namespace tensorrt { + +string NewNameWithIOPrefix(const Node* n); +void ToGraphDefWithIOPrefix(const Graph* g, GraphDef* gdef); +Status FunctionDefToGraphDef(FunctionLibraryRuntime::Handle handle, + FunctionLibraryRuntime* flib_runtime, + GraphDef* graph_def, + std::vector* input_node_ids, + std::vector* output_node_ids); + +} // namespace tensorrt +} // namespace tensorflow + +#endif +#endif +#endif diff --git a/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py b/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py index 6b72cbec9bd..a15657dd640 100644 --- a/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py +++ b/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py @@ -560,19 +560,19 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): num_engines += 1 segment_funcdef_name = node.attr["segment_funcdef_name"].s function_name = node.name + "_native_segment" - if IsQuantizationWithCalibration(run_params): + is_dynamic_engine = not node.attr["static_engine"].b + if IsQuantizationWithCalibration(run_params) or is_dynamic_engine: self.assertNotEmpty(segment_funcdef_name, node.name) self.assertIn(function_name, functions) else: - self.assertEmpty(segment_funcdef_name, node.name) - self.assertNotIn(function_name, functions) + #self.assertEmpty(segment_funcdef_name, node.name) + self.assertTrue(len(node.attr["serialized_segment"].s), node.name) + #self.assertNotIn(function_name, functions) self.assertIn(node.name, expected_engines) - self.assertTrue(len(node.attr["serialized_segment"].s), node.name) self.assertEqual( self._ToBytes(run_params.precision_mode), node.attr["precision_mode"].s, node.name) - is_dynamic_engine = not node.attr["static_engine"].b self.assertEqual(run_params.dynamic_engine, is_dynamic_engine, node.name) self.assertEqual(node.attr["use_calibration"].b, From c2aae1f1f27b9a89806272384dc4e1c462bfcd3b Mon Sep 17 00:00:00 2001 From: vivek suryamurthy Date: Fri, 12 Jul 2019 14:56:15 +0200 Subject: [PATCH 0089/3053] Improving the documentation concerning the usage examples of various learning rate schedulers --- .../keras/optimizer_v2/learning_rate_schedule.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/tensorflow/python/keras/optimizer_v2/learning_rate_schedule.py b/tensorflow/python/keras/optimizer_v2/learning_rate_schedule.py index c3fb180ddbc..c620504b891 100644 --- a/tensorflow/python/keras/optimizer_v2/learning_rate_schedule.py +++ b/tensorflow/python/keras/optimizer_v2/learning_rate_schedule.py @@ -452,7 +452,7 @@ class InverseTimeDecay(LearningRateSchedule): decay_steps = 1.0 decay_rate = 0.5 learning_rate_fn = keras.optimizers.schedules.InverseTimeDecay( - initial_learning_rate, global_step, decay_steps, decay_rate) + initial_learning_rate, decay_steps, decay_rate) model.compile(optimizer=tf.keras.optimizers.SGD( learning_rate=learning_rate_fn), @@ -549,7 +549,7 @@ class CosineDecay(LearningRateSchedule): ```python decay_steps = 1000 lr_decayed_fn = tf.keras.experimental.CosineDecay( - initial_learning_rate, global_step, decay_steps) + 
initial_learning_rate, decay_steps) ``` You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` @@ -640,7 +640,6 @@ class CosineDecayRestarts(LearningRateSchedule): lr_decayed_fn = ( tf.keras.experimental.CosineDecayRestarts( initial_learning_rate, - global_step, first_decay_steps)) ``` @@ -665,8 +664,6 @@ class CosineDecayRestarts(LearningRateSchedule): A 1-arg callable learning rate schedule that takes the current optimizer step and outputs the decayed learning rate, a scalar `Tensor` of the same type as `initial_learning_rate`. - Raises: - ValueError: if `global_step` is not supplied. """ super(CosineDecayRestarts, self).__init__() @@ -779,7 +776,7 @@ class LinearCosineDecay(LearningRateSchedule): decay_steps = 1000 lr_decayed_fn = ( tf.keras.experimental.LinearCosineDecay( - initial_learning_rate, global_step, decay_steps)) + initial_learning_rate, decay_steps)) ``` You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` @@ -899,7 +896,7 @@ class NoisyLinearCosineDecay(LearningRateSchedule): decay_steps = 1000 lr_decayed_fn = ( tf.keras.experimental.NoisyLinearCosineDecay( - initial_learning_rate, global_step, decay_steps)) + initial_learning_rate, decay_steps)) ``` You can pass this schedule directly into a `tf.keras.optimizers.Optimizer` From 97aed86c922b4799b840c27f9a0725fb43353601 Mon Sep 17 00:00:00 2001 From: Billy Lamberta Date: Fri, 12 Jul 2019 10:03:23 -0700 Subject: [PATCH 0090/3053] formatting --- tensorflow/python/ops/image_ops_impl.py | 32 ++++++++++++------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index 16172455ae6..84f2aad3623 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -3568,23 +3568,23 @@ def crop_and_resize_v2(image, Returns: A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`. 
- Usage Example: - ```python - >> import tensorflow as tf - >> BATCH_SIZE = 1 - >> NUM_BOXES = 5 - >> IMAGE_HEIGHT = 256 - >> IMAGE_WIDTH = 256 - >> CHANNELS = 3 - >> CROP_SIZE = (24, 24) + Example: - >> image = tf.random.normal(shape=(BATCH_SIZE, IMAGE_HEIGHT, IMAGE_WIDTH, CHANNELS) ) - >> boxes = tf.random.uniform(shape=(NUM_BOXES, 4)) - >> box_indices = tf.random.uniform(shape=(NUM_BOXES,), minval=0, maxval=BATCH_SIZE, dtype=tf.int32) - >> output = tf.image.crop_and_resize(image, boxes, box_indices, CROP_SIZE) - >> print(output.shape) - (5, 24, 24, 3) - ``` + ```python + import tensorflow as tf + BATCH_SIZE = 1 + NUM_BOXES = 5 + IMAGE_HEIGHT = 256 + IMAGE_WIDTH = 256 + CHANNELS = 3 + CROP_SIZE = (24, 24) + + image = tf.random.normal(shape=(BATCH_SIZE, IMAGE_HEIGHT, IMAGE_WIDTH, CHANNELS) ) + boxes = tf.random.uniform(shape=(NUM_BOXES, 4)) + box_indices = tf.random.uniform(shape=(NUM_BOXES,), minval=0, maxval=BATCH_SIZE, dtype=tf.int32) + output = tf.image.crop_and_resize(image, boxes, box_indices, CROP_SIZE) + print(output.shape) #=> (5, 24, 24, 3) + ``` """ return gen_image_ops.crop_and_resize(image, boxes, box_indices, crop_size, method, extrapolation_value, name) From 95644131f481eee0356fe922b90e7ca08be2967e Mon Sep 17 00:00:00 2001 From: Yasir Modak <42785357+ymodak@users.noreply.github.com> Date: Fri, 12 Jul 2019 12:48:49 -0700 Subject: [PATCH 0091/3053] format image_ops_impl.py removed debug print() statement --- tensorflow/python/ops/image_ops_impl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index 84f2aad3623..5cb73d84873 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -3583,7 +3583,7 @@ def crop_and_resize_v2(image, boxes = tf.random.uniform(shape=(NUM_BOXES, 4)) box_indices = tf.random.uniform(shape=(NUM_BOXES,), minval=0, maxval=BATCH_SIZE, dtype=tf.int32) output = tf.image.crop_and_resize(image, boxes, box_indices, CROP_SIZE) - print(output.shape) #=> (5, 24, 24, 3) + output.shape #=> (5, 24, 24, 3) ``` """ return gen_image_ops.crop_and_resize(image, boxes, box_indices, crop_size, From 5f3d84ba24206c22e151cc762eb9b99e0554e5ad Mon Sep 17 00:00:00 2001 From: "Wen-Heng (Jack) Chung" Date: Fri, 12 Jul 2019 21:24:22 +0000 Subject: [PATCH 0092/3053] Review code review comments. --- tensorflow/core/platform/rocm_rocdl_path.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/platform/rocm_rocdl_path.h b/tensorflow/core/platform/rocm_rocdl_path.h index 92b119fe816..29650bf0992 100644 --- a/tensorflow/core/platform/rocm_rocdl_path.h +++ b/tensorflow/core/platform/rocm_rocdl_path.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_ROCM_ROCDL_PATH_H_ -#define THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_ROCM_ROCDL_PATH_H_ +#ifndef TENSORFLOW_CORE_PLATFORM_ROCM_ROCDL_PATH_H_ +#define TENSORFLOW_CORE_PLATFORM_ROCM_ROCDL_PATH_H_ #include "tensorflow/core/platform/types.h" @@ -29,4 +29,4 @@ string ROCDLRoot(); } // namespace tensorflow -#endif // THIRD_PARTY_TENSORFLOW_CORE_PLATFORM_ROCM_ROCDL_PATH_H_ +#endif // TENSORFLOW_CORE_PLATFORM_ROCM_ROCDL_PATH_H_ From 152fc8ca887ff4aa4a288d7fab52581e4583d619 Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Fri, 12 Jul 2019 14:50:40 -0700 Subject: [PATCH 0093/3053] Added error checking in trt_engine_op.cc --- .../tf2tensorrt/kernels/trt_engine_op.cc | 22 +++---------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index f2d8a7ef9fc..f34c25ed509 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -260,22 +260,10 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) context->GetAttr("use_calibration", &use_calibration_)); native_func_ = kInvalidHandle; if (!static_engine_) { - //TODO(phillip-kravtsov) error checking here: how? - VLOG(0) << "Funcdef_name: " << funcdef_name_; - VLOG(0) << "Static Engine? " << static_engine_; - Status status = ConstructFunctionHandle(context); - VLOG(0) << "Status: " << status; + OP_REQUIRES_OK(context, ConstructFunctionHandle(context)); FunctionLibraryRuntime* lib = context->function_library(); - VLOG(0) << "Funcdef to graphdef"; - FunctionDefToGraphDef(native_func_, lib, &segment_graph_, - &input_node_ids_, &output_node_ids_); - for (int id : input_node_ids_) { - VLOG(0) << "Input node id: " << id << " from engine " << name(); - } - for (int id : output_node_ids_) { - VLOG(0) << "Output node id: " << id << " from engine " << name(); - } - + OP_REQUIRES_OK(context, FunctionDefToGraphDef(native_func_, lib, &segment_graph_, + &input_node_ids_, &output_node_ids_)); } calibration_mode_ = (use_calibration_ && precision_mode_ == TrtPrecisionMode::INT8 && @@ -491,10 +479,6 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx, // input. const int num_batch = ctx->input(0).shape().dim_size(0); const int num_binding = ctx->num_inputs() + ctx->num_outputs(); - for (int i = 0; i < num_binding; i++) { - auto binding_name = cuda_engine->getBindingName(i); - VLOG(0) << "Binding name for index " << i << " " << binding_name; - } std::vector buffers(num_binding); From 990f5cc727a7cdc3749761913db977256abb73d6 Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Fri, 12 Jul 2019 15:24:18 -0700 Subject: [PATCH 0094/3053] Removed use_function_backup parameter. 
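With the native segment FunctionDef now created for every dynamic engine (see the earlier TRTEngineOp and integration-test changes), the `use_function_backup` knob becomes redundant, so it is dropped from ConversionParams, TRTOptimizationPass, TrtConversionParams and the TrtGraphConverter constructor. A minimal sketch of an updated Python call site is below; the saved-model path and the chosen precision/engine settings are illustrative assumptions, not values taken from this change:

    from tensorflow.python.compiler.tensorrt import trt_convert

    converter = trt_convert.TrtGraphConverter(
        input_saved_model_dir="/tmp/my_saved_model",  # hypothetical path
        precision_mode=trt_convert.TrtPrecisionMode.FP32,
        is_dynamic_op=True,
        maximum_cached_engines=2,
        use_calibration=False)  # note: use_function_backup is no longer a parameter
    graph_def = converter.convert()

Callers that previously passed use_function_backup=True/False simply omit the argument; the fallback behavior is now decided internally.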
--- .../tf2tensorrt/convert/convert_graph.cc | 21 ++++-------- .../tf2tensorrt/convert/convert_graph.h | 2 -- .../convert/trt_optimization_pass.cc | 4 --- .../convert/trt_optimization_pass.h | 5 +-- .../tensorrt/test/quantization_mnist_test.py | 3 +- .../test/tf_trt_integration_test_base.py | 7 ++-- .../python/compiler/tensorrt/trt_convert.py | 21 +----------- .../compiler/tensorrt/trt_convert_test.py | 32 +++++++------------ 8 files changed, 22 insertions(+), 73 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index 0c2831df275..3f029161954 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -463,7 +463,6 @@ Status CreateTRTNode(const ConversionParams& params, } NodeDef trt_node; - //TODO(phillip-kravtsov): use_function_backup: fix this Status status = node_builder.Attr("input_shapes", input_shape_protos) .Attr("output_shapes", output_shape_protos) @@ -634,10 +633,9 @@ Status RegisterModifiedGraphToFunctionLibrary(Graph* sgraph, Graph* graph, (*native_segment ->mutable_attr())[FunctionLibraryDefinition::kIntsOnDeviceAttr] .set_b(true); - //TODO(phillip-kravtsov): set this back to 7 - if (VLOG_IS_ON(0)) { - VLOG(0) << engine_name << " Function_Def "; - VLOG(0) << native_segment->DebugString(); + if (VLOG_IS_ON(7)) { + VLOG(7) << engine_name << " Function_Def "; + VLOG(7) << native_segment->DebugString(); } VLOG(1) << "Adding funcdef to graphlib"; TF_RETURN_IF_ERROR(graph->AddFunctionLibrary(fdeflib)); @@ -697,16 +695,9 @@ std::pair GetDeviceAndAllocator(const ConversionParams& params, // Entry function from optimization pass. Status ConvertAfterShapes(const ConversionParams& params) { // Sanity checks. - if (params.precision_mode == TrtPrecisionMode::INT8) { - if (params.use_calibration && !params.use_function_backup) { - return errors::InvalidArgument( - "Calibration requires enabling fallback to TF function execution."); - } - } else { - if (params.use_calibration) { - return errors::InvalidArgument( - "Calibration with FP32 or FP16 is not supported."); - } + if (params.precision_mode != TrtPrecisionMode::INT8 && params.use_calibration) { + return errors::InvalidArgument( + "Calibration requires enabling fallback to TF function execution."); } // Convert graphdef to graph. 
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h index 74135e56cf4..f7674fb367c 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h @@ -47,8 +47,6 @@ struct ConversionParams { // maximum number of cached engines int max_cached_engines = 1; bool use_calibration = true; - // Whether to use function fallback for TRTEngineOp - bool use_function_backup = true; }; // Method to call from optimization pass diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc index 6af483d37cf..6296851d378 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc @@ -67,9 +67,6 @@ Status TRTOptimizationPass::Init( if (params.count("use_calibration")) { use_calibration_ = params.at("use_calibration").b(); } - if (params.count("use_function_backup")) { - use_function_backup_ = params.at("use_function_backup").b(); - } return Status::OK(); } @@ -259,7 +256,6 @@ Status TRTOptimizationPass::Optimize(grappler::Cluster* cluster, cp.is_dyn_op = is_dynamic_op_; cp.max_cached_engines = max_cached_batches_; cp.use_calibration = use_calibration_; - cp.use_function_backup = use_function_backup_; auto status = ConvertAfterShapes(cp); VLOG(1) << "Returning from " << name_; return status; diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h index d3fd914b302..dbed5354f15 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h @@ -40,8 +40,7 @@ class TRTOptimizationPass : public grappler::CustomGraphOptimizer { is_dynamic_op_(false), max_cached_batches_(1), max_workspace_size_bytes_(256LL << 20), - use_calibration_(true), - use_function_backup_(true) { + use_calibration_(true) { VLOG(1) << "Constructing " << name_; } @@ -71,8 +70,6 @@ class TRTOptimizationPass : public grappler::CustomGraphOptimizer { int64_t max_workspace_size_bytes_; bool use_calibration_; - // Whether to allow TF function fallback path in TRTEngineOp. - bool use_function_backup_; }; } // namespace convert diff --git a/tensorflow/python/compiler/tensorrt/test/quantization_mnist_test.py b/tensorflow/python/compiler/tensorrt/test/quantization_mnist_test.py index 56994617b90..d44a0ec7156 100644 --- a/tensorflow/python/compiler/tensorrt/test/quantization_mnist_test.py +++ b/tensorflow/python/compiler/tensorrt/test/quantization_mnist_test.py @@ -153,8 +153,7 @@ class QuantizationAwareTrainingMNISTTest(test_util.TensorFlowTestCase): # runtime to allocate GPU memory. 
max_workspace_size_bytes=1 << 28, minimum_segment_size=2, - use_calibration=False, - use_function_backup=False) + use_calibration=False) graph_def = converter.convert() logging.info('Number of nodes after TF-TRT conversion: %d', len(graph_def.node)) diff --git a/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py b/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py index a15657dd640..a41f965573a 100644 --- a/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py +++ b/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py @@ -234,10 +234,8 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): is_dynamic_op=run_params.dynamic_engine, maximum_cached_engines=1, use_calibration=run_params.use_calibration, - use_function_backup=False, max_batch_size=min(batch_list)) - return conversion_params._replace( - use_function_backup=IsQuantizationWithCalibration(conversion_params)) + return conversion_params def ShouldRunTest(self, run_params): """Whether to run the test.""" @@ -388,8 +386,7 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): minimum_segment_size=conversion_params.minimum_segment_size, is_dynamic_op=conversion_params.is_dynamic_op, maximum_cached_engines=conversion_params.maximum_cached_engines, - use_calibration=conversion_params.use_calibration, - use_function_backup=conversion_params.use_function_backup) + use_calibration=conversion_params.use_calibration) def _GetCalibratedInferGraph(self, run_params, saved_model_dir, inputs_data): """Return trt converted graphdef in INT8 mode.""" diff --git a/tensorflow/python/compiler/tensorrt/trt_convert.py b/tensorflow/python/compiler/tensorrt/trt_convert.py index b3befd69849..3e07c161a06 100644 --- a/tensorflow/python/compiler/tensorrt/trt_convert.py +++ b/tensorflow/python/compiler/tensorrt/trt_convert.py @@ -144,11 +144,6 @@ TrtConversionParams = collections.namedtuple( # trained with fake quantization. "use_calibration", - # If set to True, it will create a FunctionDef for each subgraph that is - # converted to TRT op, and if TRT ops fail to execute at runtime, it'll - # invoke that function as a fallback. - "use_function_backup", - # Max size for the input batch. # This option is deprecated in TF 2.0. "max_batch_size", @@ -162,7 +157,6 @@ DEFAULT_TRT_CONVERSION_PARAMS = TrtConversionParams( is_dynamic_op=False, maximum_cached_engines=1, use_calibration=True, - use_function_backup=True, max_batch_size=1) _TRT_ENGINE_CACHE_CONTAINER_NAME = "TF-TRT-Engine-Cache" @@ -269,8 +263,6 @@ def get_tensorrt_rewriter_config( "maximum_cached_engines"].i = conversion_params.maximum_cached_engines optimizer.parameter_map[ "use_calibration"].b = conversion_params.use_calibration - optimizer.parameter_map[ - "use_function_backup"].b = conversion_params.use_function_backup if is_v2: # Static mode (a.k.a pre-generating TRT engines and make them node @@ -328,8 +320,7 @@ class TrtGraphConverter(object): minimum_segment_size=3, is_dynamic_op=False, maximum_cached_engines=1, - use_calibration=True, - use_function_backup=True): + use_calibration=True): """Initialize the converter. Args: @@ -368,9 +359,6 @@ class TrtGraphConverter(object): will occur. Please note that accuracy may be negatively affected if there is a mismatch between which tensors TRT quantizes and which tensors were trained with fake quantization. 
- use_function_backup: if set to True, it will create a FunctionDef for each - subgraph that is converted to TRT op, and if TRT ops fail to execute at - runtime, it'll invoke that function as a fallback. Raises: ValueError: if the combination of the parameters is invalid. @@ -409,12 +397,6 @@ class TrtGraphConverter(object): "dynamic TRT ops only. Disregarding is_dynamic_op parameter.") is_dynamic_op = True - # TODO(laigd): consider provide a mechanism to remove the fallback path - # after calibration is done. - if self._need_calibration and not use_function_backup: - raise ValueError( - "Calibration requires enabling fallback to TF function execution.") - # TODO(laigd): # - Verify in int8 mode that maximum_cached_engines is set properly. # - If it fails to build the int8 engine it should return error. @@ -431,7 +413,6 @@ class TrtGraphConverter(object): is_dynamic_op=is_dynamic_op, maximum_cached_engines=maximum_cached_engines, use_calibration=use_calibration, - use_function_backup=use_function_backup, max_batch_size=max_batch_size) _check_conversion_params(self._conversion_params) diff --git a/tensorflow/python/compiler/tensorrt/trt_convert_test.py b/tensorflow/python/compiler/tensorrt/trt_convert_test.py index 61ecd79beb2..cdd24ce041e 100644 --- a/tensorflow/python/compiler/tensorrt/trt_convert_test.py +++ b/tensorflow/python/compiler/tensorrt/trt_convert_test.py @@ -200,8 +200,7 @@ class TrtConvertTest(test_util.TensorFlowTestCase): max_batch_size=1, minimum_segment_size=3, is_dynamic_op=False, - maximum_cached_engines=1, - use_function_backup=False): + maximum_cached_engines=1): """Helper method to convert a GraphDef or SavedModel using TF-TRT.""" converter = trt_convert.TrtGraphConverter( input_saved_model_dir=input_saved_model_dir, @@ -215,8 +214,7 @@ class TrtConvertTest(test_util.TensorFlowTestCase): else trt_convert.TrtPrecisionMode.FP32), minimum_segment_size=minimum_segment_size, is_dynamic_op=is_dynamic_op, - maximum_cached_engines=maximum_cached_engines, - use_function_backup=use_function_backup) + maximum_cached_engines=maximum_cached_engines) output_graph_def = converter.convert() if need_calibration: @@ -249,8 +247,7 @@ class TrtConvertTest(test_util.TensorFlowTestCase): input_saved_model_dir=input_saved_model_dir, output_saved_model_dir=output_saved_model_dir, need_calibration=need_calibration, - is_dynamic_op=is_dynamic_op, - use_function_backup=need_calibration) + is_dynamic_op=is_dynamic_op) graph_defs_to_verify = [output_graph_def] if output_saved_model_dir: @@ -314,8 +311,7 @@ class TrtConvertTest(test_util.TensorFlowTestCase): conversion_params=trt_convert.DEFAULT_TRT_CONVERSION_PARAMS._replace( precision_mode=trt_convert.TrtPrecisionMode.FP32, is_dynamic_op=True, - maximum_cached_engines=2, - use_function_backup=False)) + maximum_cached_engines=2)) @test_util.run_v2_only def testTrtGraphConverter_BasicConversion_v2(self): @@ -445,7 +441,6 @@ class TrtConvertTest(test_util.TensorFlowTestCase): def _TestRun(self, sess, batch_size, - use_function_backup=False, expect_engine_is_run=True): try: result = sess.run( @@ -454,7 +449,8 @@ class TrtConvertTest(test_util.TensorFlowTestCase): except errors.OpError as e: # This should happen only when fallback path is disabled and TRT engine # fails to run. 
- self.assertTrue(not use_function_backup and not expect_engine_is_run) + # TODO(phillip-kravtsov) Check what correct handling is + #self.assertTrue(not use_function_backup and not expect_engine_is_run) self.assertIn("Fallback path is disabled, for TRTEngineOp_0", str(e)) @test_util.deprecated_graph_mode_only @@ -486,8 +482,7 @@ class TrtConvertTest(test_util.TensorFlowTestCase): input_saved_model_dir=input_saved_model_dir, output_saved_model_dir=output_saved_model_dir, is_dynamic_op=True, - maximum_cached_engines=2, - use_function_backup=False) # Disallow fallback. + maximum_cached_engines=2) # Test the output GraphDef. with ops.Graph().as_default(): @@ -513,7 +508,7 @@ class TrtConvertTest(test_util.TensorFlowTestCase): # the max, it should evict an old engine and create a new one. self._TestRun(sess, 3) - def _TestStaticOp(self, use_function_backup): + def _TestStaticOp(self): if not is_tensorrt_enabled(): return @@ -524,8 +519,7 @@ class TrtConvertTest(test_util.TensorFlowTestCase): output_graph_def = self._ConvertGraph( input_saved_model_dir=input_saved_model_dir, output_saved_model_dir=output_saved_model_dir, - maximum_cached_engines=2, # This is noop, added just for testing. - use_function_backup=use_function_backup) + maximum_cached_engines=2) # Test the output GraphDef. with ops.Graph().as_default(): @@ -536,14 +530,12 @@ class TrtConvertTest(test_util.TensorFlowTestCase): self._TestRun( sess, 1, - use_function_backup=use_function_backup, expect_engine_is_run=True) # Run with batch size 2, which exceed the max_batch_size, it should try # to fall back to TF function. self._TestRun( sess, 2, - use_function_backup=use_function_backup, expect_engine_is_run=False) # Test the output SavedModel @@ -555,23 +547,21 @@ class TrtConvertTest(test_util.TensorFlowTestCase): self._TestRun( sess, 1, - use_function_backup=use_function_backup, expect_engine_is_run=True) # Run with batch size 2, which exceed the max_batch_size, it should try # to fall back to TF function. self._TestRun( sess, 2, - use_function_backup=use_function_backup, expect_engine_is_run=False) @test_util.deprecated_graph_mode_only def testTrtGraphConverter_StaticOp_NoFallback(self): - self._TestStaticOp(use_function_backup=False) + self._TestStaticOp() @test_util.deprecated_graph_mode_only def testTrtGraphConverter_StaticOp_WithFallback(self): - self._TestStaticOp(use_function_backup=True) + self._TestStaticOp() if __name__ == "__main__": From 99b097705bde93d9021b08afb083383c8b3ff81f Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Fri, 12 Jul 2019 15:33:32 -0700 Subject: [PATCH 0095/3053] Removed excessively verbose logging from trt. 
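The VLOG(0) statements added while bringing up the dynamic-engine path are deleted, and the per-function name dump in TRTEngineOp::ConstructFunctionHandle is demoted to VLOG(2). If that detail is still needed while debugging, it can usually be surfaced at runtime without a code change; the sketch below assumes TensorFlow's standard C++ logging environment variables (TF_CPP_MIN_VLOG_LEVEL / TF_CPP_VMODULE), which are not part of this change:

    import os
    # Assumed TF logging env vars; they must be set before the TF runtime loads.
    os.environ["TF_CPP_MIN_VLOG_LEVEL"] = "2"          # enable VLOG(2) globally, or
    os.environ["TF_CPP_VMODULE"] = "trt_engine_op=2"   # only for trt_engine_op.cc
    import tensorflow as tf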
--- tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc | 1 - tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc | 2 +- tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc | 4 ---- tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc | 4 ---- 4 files changed, 1 insertion(+), 10 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index 3f029161954..112966acb40 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -135,7 +135,6 @@ Status GetEngineInfo(const Graph* g, DeviceNameUtils::ParsedName parsed_name; const bool parse_succeeded = DeviceNameUtils::ParseFullName(node_device, &parsed_name); - VLOG(0) << node_device; if (!parse_succeeded || (parse_succeeded && parsed_name.type == "CPU")) { string msg; if (!parse_succeeded) { diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index f34c25ed509..4c1a2127fb3 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -212,7 +212,7 @@ Status TRTEngineOp::ConstructFunctionHandle(OpKernelConstruction* ctx) { } auto func_names = lib->GetFunctionLibraryDefinition()->ListFunctionNames(); for (auto func_name : func_names) { - VLOG(0) << "Func name: " << func_name; + VLOG(2) << "Func name: " << func_name; } auto fdef = lib->GetFunctionLibraryDefinition()->Find(funcdef_name_); if (fdef == nullptr) { diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc index 6205254c72a..dc31e5c156e 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc @@ -62,7 +62,6 @@ class TRTEngineOpTestBase : public OpsTestBase { GraphDef graph_def; TF_ASSERT_OK(s.ToGraphDef(&graph_def)); /* - //VLOG(0) << "Beginning TRTEngineOpTest new code"; */ const string func_name = "myop_native_segment"; Graph* graph = s.graph(); @@ -74,18 +73,15 @@ class TRTEngineOpTestBase : public OpsTestBase { //TF_ASSERT_OK(convert::RegisterSegmentFunctionToFunctionLibrary(graph, graph_def, "myop")); //FunctionDefLibrary fdeflib; - //VLOG(0) << "Before converting graph to function def"; //auto native_segment = fdeflib.add_function(); //GraphToFunctionDef(*graph, func_name, native_segment); - //VLOG(0) << "After conversion from graph to func def"; /*(*native_segment ->mutable_attr())[FunctionLibraryDefinition::kIntsOnDeviceAttr] .set_b(true); */ //graph->AddFunctionLibrary(fdeflib); - //VLOG(0) << native_segment->DebugString(); PartialTensorShape shape({-1, -1}); diff --git a/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc b/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc index 38b39804113..af76d84b232 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc @@ -113,8 +113,6 @@ Status FunctionDefToGraphDef(FunctionLibraryRuntime::Handle handle, std::vector* output_node_ids) { const FunctionLibraryDefinition* flib_def = flib_runtime->GetFunctionLibraryDefinition(); const FunctionBody* fbody; - VLOG(0) << "Getting Function Body \n"; - VLOG(0) << "HANDLE" << handle; fbody = flib_runtime->GetFunctionBody(handle); //TF_RET_CHECK(*fbody) std::unique_ptr graph(new Graph(flib_def)); 
@@ -160,9 +158,7 @@ Status FunctionDefToGraphDef(FunctionLibraryRuntime::Handle handle, for (const auto node_def : graph_def->node()) { string node_name = node_def.name(); - VLOG(0) << "NODENAME AFTER FROM FUNCDEF " << node_name << ", op=" << node_def.op(); } - VLOG(0) << "Finished converting \n"; return Status::OK(); From c589417ff81aa59160684fdc84ffac44095ac82e Mon Sep 17 00:00:00 2001 From: Leslie-Fang Date: Mon, 15 Jul 2019 19:52:45 +0800 Subject: [PATCH 0096/3053] fix the issue when doing tf.Cast operation Fix the issue: https://github.com/tensorflow/tensorflow/issues/30691 --- tensorflow/core/grappler/optimizers/constant_folding.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc index 54ef5567197..6b7ceff65b2 100644 --- a/tensorflow/core/grappler/optimizers/constant_folding.cc +++ b/tensorflow/core/grappler/optimizers/constant_folding.cc @@ -1205,7 +1205,7 @@ Status ConstantFolding::CreateNodeDef(const string& name, case DT_INT64: POPULATE_TENSOR_PROTO(tensor, t, int64, int64); case DT_UINT64: - POPULATE_TENSOR_PROTO(tensor, t, uint64, int64); + POPULATE_TENSOR_PROTO(tensor, t, uint64, uint64); case DT_INT32: POPULATE_TENSOR_PROTO(tensor, t, int32, int); case DT_UINT32: From ac108e8789a2564d07675fd67bd827715f384ffd Mon Sep 17 00:00:00 2001 From: Bhavani Subramanian Date: Mon, 15 Jul 2019 12:01:59 -0700 Subject: [PATCH 0097/3053] Addressed review comments. --- tensorflow/core/kernels/mkl_qmatmul_op.cc | 4 +- tensorflow/core/util/mkl_util.h | 938 +++++++--------------- 2 files changed, 296 insertions(+), 646 deletions(-) diff --git a/tensorflow/core/kernels/mkl_qmatmul_op.cc b/tensorflow/core/kernels/mkl_qmatmul_op.cc index fc571602b35..4aff02ac827 100644 --- a/tensorflow/core/kernels/mkl_qmatmul_op.cc +++ b/tensorflow/core/kernels/mkl_qmatmul_op.cc @@ -737,8 +737,8 @@ class MklDnnQuantizedMatMulOp : public OpKernel { output_mkl_shape.SetMklTensor(true); output_mkl_shape.SetMklLayout(&dst_pd); output_mkl_shape.SetElemType(MklDnnType()); - output_mkl_shape.SetTfLayout2D(output_dims_mkl_order.size(), - output_dims_mkl_order, output_tf_format); + output_mkl_shape.SetTfLayout(output_dims_mkl_order.size(), + output_dims_mkl_order, output_tf_format, true); TensorShape output_tf_shape; output_tf_shape.AddDim((dst_pd.get_size() / sizeof(Toutput))); diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index 38aad335212..f37f3b8a4b7 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -123,45 +123,70 @@ enum class MklQuantization { static const int kSmallBatchSize = 32; #ifdef ENABLE_MKLDNN_V1 -// In MKL-DNN v1.0, the format (ex. 
NCHW) used to initialize a memory descriptor +#define ENGINE_CPU engine::kind::cpu +#define MEMORY_FORMAT memory::format_tag +#define MKL_TENSOR_FORMAT MklTensorFormat +#define MKL_TENSOR_FORMAT_BLOCKED MklTensorFormat::FORMAT_BLOCKED +#define MKL_TENSOR_FORMAT_INVALID MklTensorFormat::FORMAT_INVALID +#define MKL_TENSOR_FORMAT_NCDHW MklTensorFormat::FORMAT_NCDHW +#define MKL_TENSOR_FORMAT_NDHWC MklTensorFormat::FORMAT_NDHWC +#define MKL_TENSOR_FORMAT_NHWC MklTensorFormat::FORMAT_NHWC +#define MKL_TENSOR_FORMAT_NCHW MklTensorFormat::FORMAT_NCHW +#define MKL_TENSOR_FORMAT_UNDEF MKL_TENSOR_FORMAT_BLOCKED +#define MEMORY_DATA_TYPE_UNDEF memory::data_type::undef +#define MEMORY_PRIMITIVE_DESC memory::desc +#define TENSOR_FORMAT MKL_TENSOR_FORMAT +#define TENSOR_FORMAT_NHWC MKL_TENSOR_FORMAT_NHWC +#else +#define ENGINE_CPU engine::cpu +#define MEMORY_FORMAT memory::format +#define MKL_TENSOR_FORMAT memory::format +#define MKL_TENSOR_FORMAT_BLOCKED memory::format::blocked +#define MKL_TENSOR_FORMAT_INVALID memory::format::format_undef +#define MKL_TENSOR_FORMAT_NCDHW memory::format::ncdhw +#define MKL_TENSOR_FORMAT_NDHWC memory::format::ndhwc +#define MKL_TENSOR_FORMAT_NHWC memory::format::nhwc +#define MKL_TENSOR_FORMAT_NCHW memory::format::nchw +#define MKL_TENSOR_FORMAT_UNDEF MKL_TENSOR_FORMAT_INVALID +#define MEMORY_DATA_TYPE_UNDEF memory::data_type::data_undef +#define MEMORY_PRIMITIVE_DESC memory::primitive_desc +#define TENSOR_FORMAT TensorFormat +#define TENSOR_FORMAT_NHWC FORMAT_NHWC +#endif // ENABLE_MKLDNN_V1 + +#ifdef ENABLE_MKLDNN_V1 +// In MKL-DNN v1.x, the format (ex. NCHW) used to initialize a memory descriptor // (md) structure will no longer be recorded in its `format` field. Instead, it // will be set to a canonical `blocked` format for every fully described md. // // Currently, we query this `format` field while mapping MKL-DNN's data format // to TF's data format. Due to the above restriction, we will now get this data // format information from TF's `data_format` attribute (i.e. via -// `TensorFormat`) for MKL-DNN v1.0. +// `TensorFormat`) for MKL-DNN v1.x. // -// Since MKL-DNN operators such as ReLU do not have a `data_format` attribute -// (since they are in `blocked` format), we need to be able to distinguish -// between blocked and non-blocked formats. For this, we have defined a new -// enum called `MklTensorFormat` which is similar to `TensorFormat` but with -// an additional field called `FORMAT_UNDEF`, which could mean one of the -// following depending on the context: -// -// 1) Blocked format: as described above, this is needed for element-wise +// Some MKL-DNN operators such as ReLU do not have a `data_format` attribute +// since they are usually in `blocked` format. Therefore, in order to +// distinguish between blocked and non-blocked formats, we have defined a new +// enum called `MklTensorFormat` that is semantically similar to `TensorFormat` +// but with two additional fields namely: +// 1) FORMAT_BLOCKED: as described above, this is needed for element-wise // operators such as ReLU. -// 2) Invalid format: ex. unsupported format -// TODO(bhavanis): Do we need a separate field for invalid formats? +// 2) FORMAT_INVALID: for error-checking (ex. 
unsupported format) enum class MklTensorFormat { FORMAT_NHWC = 0, FORMAT_NCHW = 1, FORMAT_NDHWC = 2, FORMAT_NCDHW = 3, - FORMAT_UNDEF = 4, // either blocked or invalid + FORMAT_BLOCKED = 4, + FORMAT_INVALID = 5, }; -#endif -#ifdef ENABLE_MKLDNN_V1 // Forward declarations -TensorFormat MklDnn3DDataFormatToTFDataFormat(MklTensorFormat format); -TensorFormat MklDnnDataFormatToTFDataFormat(MklTensorFormat format); memory::format_tag MklTensorFormatToMklDnnDataFormat(MklTensorFormat format); -#else -// Forward declarations -TensorFormat MklDnn3DDataFormatToTFDataFormat(memory::format format); -TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format); -#endif +#endif // ENABLE_MKLDNN_V1 + +TensorFormat MklDnn3DDataFormatToTFDataFormat(MKL_TENSOR_FORMAT format); +TensorFormat MklDnnDataFormatToTFDataFormat(MKL_TENSOR_FORMAT format); memory::dims CalculateTFStrides(const memory::dims& dims_tf_order); memory::desc CreateBlockedMemDescHelper(const memory::dims& dim, const memory::dims& strides, @@ -191,30 +216,24 @@ inline std::ostream& operator<<(std::ostream& os, os << "FORMAT_NDHWC"; } else if (format == MklTensorFormat::FORMAT_NCDHW) { os << "FORMAT_NCDHW"; - } else if (format == MklTensorFormat::FORMAT_UNDEF) { - os << "FORMAT_UNDEF"; + } else if (format == MklTensorFormat::FORMAT_BLOCKED) { + os << "FORMAT_BLOCKED"; } else { os << "INVALID FORMAT"; } } -#endif +#endif // ENABLE_MKLDNN_V1 class MklDnnShape { private: typedef struct { - /// Flag to indicate if the tensor is an MKL tensor or not + // Flag to indicate if the tensor is an MKL tensor or not bool is_mkl_tensor_ = false; - /// Number of dimensions in Tensorflow format + // Number of dimensions in Tensorflow format size_t dimension_ = 0; - /// Required by MKLDNN for conversions mkldnn_dims_t sizes_; // Required by MKL for conversions -#ifdef ENABLE_MKLDNN_V1 - MklTensorFormat tf_data_format_ = MklTensorFormat::FORMAT_UNDEF; - memory::data_type T_ = memory::data_type::undef; -#else - memory::format tf_data_format_ = memory::format::format_undef; - memory::data_type T_ = memory::data_type::data_undef; -#endif + MKL_TENSOR_FORMAT tf_data_format_ = MKL_TENSOR_FORMAT_UNDEF; + memory::data_type T_ = MEMORY_DATA_TYPE_UNDEF; // MKL layout mkldnn_memory_desc_t mkl_md_; /// TF dimension corresponding to this MKL dimension @@ -257,7 +276,6 @@ class MklDnnShape { return true; } -#ifdef ENABLE_MKLDNN_V1 /// Equality function for MklDnnShape objects /// @return true if both are equal; false otherwise. inline bool operator==(const MklDnnShape& input_shape) const { @@ -265,37 +283,25 @@ class MklDnnShape { return false; } - // If input tensors are in Mkl layout, then we check for dimensions and + // If input tensors are in MKL layout, then we check for dimensions and // sizes. if (this->IsMklTensor()) { +#ifdef ENABLE_MKLDNN_V1 const mkldnn_memory_desc_t& cur_md = (this->GetMklLayout()).data; const mkldnn_memory_desc_t& input_shape_md = input_shape.GetMklLayout().data; return this->GetTfShape() == input_shape.GetTfShape() && mkldnn_memory_desc_equal(&cur_md, &input_shape_md); - } - - return true; - } #else - /// Equality function for MklDnnShape objects - /// @return true if both are equal; false otherwise. - inline bool operator==(const MklDnnShape& input_shape) const { - if (this->IsMklTensor() != input_shape.IsMklTensor()) { - return false; - } - - // If input tensors are in Mkl layout, then we check for dimensions and - // sizes. 
- if (this->IsMklTensor()) { return this->GetTfShape() == input_shape.GetTfShape() && CompareMklDnnLayouts(this->GetMklLayout(), input_shape.GetMklLayout()); +#endif // ENABLE_MKLDNN_V1 } + // Both inputs are not MKL tensors. return true; } -#endif /// Equality operator for MklDnnShape and TFShape. /// Returns: true if TF shapes for both are the same, false otherwise @@ -395,13 +401,9 @@ class MklDnnShape { CHECK_EQ(data_.is_mkl_tensor_, true); std::vector shape(data_.dimension_, -1); -#ifdef ENABLE_MKLDNN_V1 // As mentioned in the comment above, we now rely on TF's `data_format` // attribute to determine if TF shape is in blocked format or not. - if (data_.tf_data_format_ != MklTensorFormat::FORMAT_UNDEF) { -#else - if (data_.tf_data_format_ != memory::format::blocked) { -#endif + if (data_.tf_data_format_ != MKL_TENSOR_FORMAT_BLOCKED) { for (size_t idx = 0; idx < data_.dimension_; ++idx) { shape[idx] = data_.sizes_[TfDimIdx(idx)]; } @@ -424,12 +426,12 @@ class MklDnnShape { inline const memory::data_type GetElemType() { return data_.T_; } #ifndef ENABLE_MKLDNN_V1 - // Memory primitive descriptor is deprecated in MKL-DNN v1.0. + // Memory primitive descriptor is deprecated in MKL-DNN v1.x. inline void SetMklLayout(memory::primitive_desc* pd) { CHECK_NOTNULL(pd); data_.mkl_md_ = pd->desc().data; } -#endif +#endif // !ENABLE_MKLDNN_V1 inline void SetMklLayout(memory::desc* md) { CHECK_NOTNULL(md); @@ -440,8 +442,7 @@ class MklDnnShape { return memory::desc(data_.mkl_md_); } -#ifdef ENABLE_MKLDNN_V1 - inline MklTensorFormat GetTfDataFormat() const { + inline MKL_TENSOR_FORMAT GetTfDataFormat() const { return data_.tf_data_format_; } @@ -449,7 +450,7 @@ class MklDnnShape { /// We use lazy evaluation and create it only when needed. Input format can /// also be Blocked format. inline void SetTfLayout(size_t dims, const memory::dims& sizes, - MklTensorFormat format) { + MKL_TENSOR_FORMAT format, bool is_2d = false) { DCHECK_EQ(dims, sizes.size()) << "SetTfLayout: Number of dimensions does not" "match with dimension array"; @@ -458,24 +459,13 @@ class MklDnnShape { data_.sizes_[ii] = sizes[ii]; } data_.tf_data_format_ = format; - if (format != MklTensorFormat::FORMAT_UNDEF) { - SetTfDimOrder(dims, format); - } - } - - inline void SetTfLayout2D(size_t dims, const memory::dims& sizes, - MklTensorFormat format) { - DCHECK_EQ(dims, sizes.size()) - << "SetTfLayout2D: Number of dimensions does not" - "match with dimension array"; - data_.dimension_ = dims; - for (size_t ii = 0; ii < dims; ++ii) { - data_.sizes_[ii] = sizes[ii]; - } - data_.tf_data_format_ = format; - if (format != MklTensorFormat::FORMAT_UNDEF) { - data_.map_[0] = MklDnnDims::Dim_N; - data_.map_[1] = MklDnnDims::Dim_C; + if (format != MKL_TENSOR_FORMAT_BLOCKED) { + if (is_2d) { + data_.map_[0] = MklDnnDims::Dim_N; + data_.map_[1] = MklDnnDims::Dim_C; + } else { + SetTfDimOrder(dims, format); + } } } @@ -486,70 +476,20 @@ class MklDnnShape { } // Create Blocked memory desc if input TF format was set like that. 
- if (data_.tf_data_format_ == MklTensorFormat::FORMAT_UNDEF) { + if (data_.tf_data_format_ == MKL_TENSOR_FORMAT_BLOCKED) { auto strides = CalculateTFStrides(dims); return CreateBlockedMemDescHelper(dims, strides, data_.T_); } else { +#ifdef ENABLE_MKLDNN_V1 auto format_tag = MklTensorFormatToMklDnnDataFormat(data_.tf_data_format_); DCHECK_NE(format_tag, memory::format_tag::undef); return memory::desc(dims, data_.T_, format_tag); - } - } #else - inline memory::format GetTfDataFormat() const { - return data_.tf_data_format_; - } - - /// We don't create primitive_descriptor for TensorFlow layout now. - /// We use lazy evaluation and create it only when needed. Input format can - /// also be Blocked format. - inline void SetTfLayout(size_t dims, const memory::dims& sizes, - memory::format format) { - DCHECK_EQ(dims, sizes.size()) - << "SetTfLayout: Number of dimensions does not" - "match with dimension array"; - data_.dimension_ = dims; - for (size_t ii = 0; ii < dims; ii++) { - data_.sizes_[ii] = sizes[ii]; - } - data_.tf_data_format_ = format; - if (format != memory::format::blocked) { - SetTfDimOrder(dims, format); - } - } - - inline void SetTfLayout2D(size_t dims, const memory::dims& sizes, - memory::format format) { - DCHECK_EQ(dims, sizes.size()) - << "SetTfLayout2D: Number of dimensions does not" - "match with dimension array"; - data_.dimension_ = dims; - for (size_t ii = 0; ii < dims; ++ii) { - data_.sizes_[ii] = sizes[ii]; - } - data_.tf_data_format_ = format; - if (format != memory::format::blocked) { - data_.map_[0] = MklDnnDims::Dim_N; - data_.map_[1] = MklDnnDims::Dim_C; - } - } - - inline const memory::desc GetTfLayout() const { - memory::dims dims; - for (size_t ii = 0; ii < data_.dimension_; ii++) { - dims.push_back(data_.sizes_[ii]); - } - - // Create Blocked memory desc if input TF format was set like that. - if (data_.tf_data_format_ == memory::format::blocked) { - auto strides = CalculateTFStrides(dims); - return CreateBlockedMemDescHelper(dims, strides, data_.T_); - } else { return memory::desc(dims, data_.T_, data_.tf_data_format_); +#endif // ENABLE_MKLDNN_V1 } } -#endif inline const memory::desc GetCurLayout() const { return IsMklTensor() ? GetMklLayout() : GetTfLayout(); @@ -588,17 +528,10 @@ class MklDnnShape { } } -#ifdef ENABLE_MKLDNN_V1 - inline void SetTfDimOrder(const size_t dimension, MklTensorFormat format) { + inline void SetTfDimOrder(const size_t dimension, MKL_TENSOR_FORMAT format) { TensorFormat data_format = MklDnnDataFormatToTFDataFormat(format); SetTfDimOrder(dimension, data_format); } -#else - inline void SetTfDimOrder(const size_t dimension, memory::format format) { - TensorFormat data_format = MklDnnDataFormatToTFDataFormat(format); - SetTfDimOrder(dimension, data_format); - } -#endif inline const mkldnn_dim_t* GetTfToMklDimMap() const { return &data_.map_[0]; } inline size_t TfDimIdx(int index) const { return data_.map_[index]; } @@ -629,7 +562,7 @@ class MklDnnShape { return TfDimIdx(d) == MklDnnDims::Dim_H; } - /// Check if the TF-Mkl dimension ordering map specifies if the input + /// Check if the TF-MKL dimension ordering map specifies if the input /// tensor is in NCHW format. inline bool IsTensorInNCHWFormat() const { TensorFormat data_format = FORMAT_NCHW; @@ -639,7 +572,7 @@ class MklDnnShape { IsMklWidthDim(GetTensorDimIndex<2>(data_format, 'W'))); } - /// Check if the TF-Mkl dimension ordering map specifies if the input + /// Check if the TF-MKL dimension ordering map specifies if the input /// tensor is in NHWC format. 
inline bool IsTensorInNHWCFormat() const { TensorFormat data_format = FORMAT_NHWC; @@ -699,21 +632,19 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor, TF_CHECK_OK(context->allocate_temp(DataTypeToEnum::v(), output_shape, &output_tensor)); + engine cpu_engine(ENGINE_CPU, 0); #ifdef ENABLE_MKLDNN_V1 - engine cpu_engine(engine::kind::cpu, 0); stream cpu_stream(cpu_engine); -#else - auto cpu_engine = engine(engine::cpu, 0); -#endif +#endif // ENABLE_MKLDNN_V1 MklDnnData input(&cpu_engine); - // Get Mkl layout of input tensor. + // Get MKL layout of input tensor. auto input_mkl_md = mkl_shape.GetMklLayout(); auto output_tf_md = mkl_shape.GetTfLayout(); #ifndef ENABLE_MKLDNN_V1 - // Memory primitive descriptor is deprecated in MKL-DNN v1.0. + // Memory primitive descriptor is deprecated in MKL-DNN v1.x. auto output_tf_pd = memory::primitive_desc(output_tf_md, cpu_engine); -#endif +#endif // !ENABLE_MKLDNN_V1 input.SetUsrMem(input_mkl_md, &mkl_tensor); #ifdef ENABLE_MKLDNN_V1 @@ -721,9 +652,8 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor, if (input.IsReorderNeeded(output_tf_md)) { std::vector net; std::vector net_args; - DCHECK_EQ(input.CheckReorderToOpMem(output_tf_md, &output_tensor, net, - net_args, &cpu_engine), - true); + DCHECK(input.CheckReorderToOpMem(output_tf_md, &output_tensor, net, + net_args, &cpu_engine)); DCHECK_EQ(net.size(), net_args.size()); for (size_t i = 0; i < net.size(); ++i) { net.at(i).execute(cpu_stream, net_args.at(i)); @@ -736,7 +666,7 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor, CHECK_EQ(input.CheckReorderToOpMem(output_tf_pd, &output_tensor, &net), true); stream(stream::kind::eager).submit(net).wait(); -#endif +#endif // ENABLE_MKLDNN_V1 } else { // If not, just forward input tensor to output tensor. CHECK(output_tensor.CopyFrom(mkl_tensor, output_shape)); @@ -840,19 +770,8 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n, // Allocates a temp tensor and returns the data buffer for temporary storage. 
template -#ifdef ENABLE_MKLDNN_V1 inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out, - const memory::desc& md, void** buf_out) { - TensorShape tf_shape; - - tf_shape.AddDim(md.get_size() / sizeof(T) + 1); - OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum::v(), - tf_shape, tensor_out)); - *buf_out = static_cast(tensor_out->flat().data()); -} -#else -inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out, - const memory::primitive_desc& pd, void** buf_out) { + const MEMORY_PRIMITIVE_DESC& pd, void** buf_out) { TensorShape tf_shape; tf_shape.AddDim(pd.get_size() / sizeof(T) + 1); @@ -860,7 +779,6 @@ inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out, tf_shape, tensor_out)); *buf_out = static_cast(tensor_out->flat().data()); } -#endif template inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out, @@ -869,12 +787,13 @@ inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out, tf_shape, tensor_out)); } +inline void GetStridesFromSizes(TENSOR_FORMAT data_format, size_t* strides, + const size_t* sizes) { #ifdef ENABLE_MKLDNN_V1 -inline void GetStridesFromSizes(MklTensorFormat data_format, size_t* strides, - const size_t* sizes) { - DCHECK_NE(data_format, MklTensorFormat::FORMAT_UNDEF); + DCHECK_NE(data_format, MklTensorFormat::FORMAT_INVALID); +#endif // ENABLE_MKLDNN_V1 // MKL requires strides in NCHW - if (data_format == MklTensorFormat::FORMAT_NHWC) { + if (data_format == TENSOR_FORMAT_NHWC) { strides[0] = sizes[2]; strides[1] = sizes[0] * sizes[2]; strides[2] = 1; @@ -886,23 +805,6 @@ inline void GetStridesFromSizes(MklTensorFormat data_format, size_t* strides, strides[3] = sizes[0] * sizes[1] * sizes[2]; } } -#else -inline void GetStridesFromSizes(TensorFormat data_format, size_t* strides, - const size_t* sizes) { - // MKL requires strides in NCHW - if (data_format == FORMAT_NHWC) { - strides[0] = sizes[2]; - strides[1] = sizes[0] * sizes[2]; - strides[2] = 1; - strides[3] = sizes[0] * sizes[1] * sizes[2]; - } else { - strides[0] = 1; - strides[1] = sizes[0]; - strides[2] = sizes[0] * sizes[1]; - strides[3] = sizes[0] * sizes[1] * sizes[2]; - } -} -#endif inline void CopyMklTensorInToOut(OpKernelContext* context, int idx_in, int idx_out) { @@ -1065,7 +967,7 @@ memory::data_type MklDnnType() { // Fails with an error if invalid data format. inline memory::format_tag MklTensorFormatToMklDnnDataFormat( MklTensorFormat format) { - DCHECK_NE(format, MklTensorFormat::FORMAT_UNDEF); + DCHECK_NE(format, MklTensorFormat::FORMAT_INVALID); using tag = memory::format_tag; if (format == MklTensorFormat::FORMAT_NHWC) return tag::nhwc; if (format == MklTensorFormat::FORMAT_NCHW) return tag::nchw; @@ -1073,18 +975,17 @@ inline memory::format_tag MklTensorFormatToMklDnnDataFormat( if (format == MklTensorFormat::FORMAT_NCDHW) return tag::ncdhw; return tag::undef; } -#endif +#endif // ENABLE_MKLDNN_V1 -#ifdef ENABLE_MKLDNN_V1 /// Map TensorFlow data format into MKL-DNN 3D data format /// @input: TensorFlow data format /// @return: MKL-DNN 3D data format corresponding to TensorFlow data format; /// Fails with an error if invalid data format. 
-inline MklTensorFormat TFDataFormatToMklDnn3DDataFormat(TensorFormat format) { - if (format == FORMAT_NHWC) return MklTensorFormat::FORMAT_NDHWC; - if (format == FORMAT_NCHW) return MklTensorFormat::FORMAT_NCDHW; +inline MKL_TENSOR_FORMAT TFDataFormatToMklDnn3DDataFormat(TensorFormat format) { + if (format == FORMAT_NHWC) return MKL_TENSOR_FORMAT_NDHWC; + if (format == FORMAT_NCHW) return MKL_TENSOR_FORMAT_NCDHW; TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT, "Unsupported data format")); - return MklTensorFormat::FORMAT_UNDEF; // Invalid format + return MKL_TENSOR_FORMAT_INVALID; } /// Map TensorFlow data format into MKL-DNN data format @@ -1092,11 +993,11 @@ inline MklTensorFormat TFDataFormatToMklDnn3DDataFormat(TensorFormat format) { /// @input: TensorFlow data format /// @return: MKL-DNN data format corresponding to TensorFlow data format; /// Fails with an error if invalid data format. -inline MklTensorFormat TFDataFormatToMklDnnDataFormat(TensorFormat format) { - if (format == FORMAT_NHWC) return MklTensorFormat::FORMAT_NHWC; - if (format == FORMAT_NCHW) return MklTensorFormat::FORMAT_NCHW; +inline MKL_TENSOR_FORMAT TFDataFormatToMklDnnDataFormat(TensorFormat format) { + if (format == FORMAT_NHWC) return MKL_TENSOR_FORMAT_NHWC; + if (format == FORMAT_NCHW) return MKL_TENSOR_FORMAT_NCHW; TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT, "Unsupported data format")); - return MklTensorFormat::FORMAT_UNDEF; // Invalid format + return MKL_TENSOR_FORMAT_INVALID; } /// Map MKL-DNN data format into TensorFlow data format @@ -1104,12 +1005,10 @@ inline MklTensorFormat TFDataFormatToMklDnnDataFormat(TensorFormat format) { /// @input: MKL-DNN data format /// @return: Tensorflow data format corresponding to MKL-DNN data format; /// Fails with an error if invalid data format. -inline TensorFormat MklDnnDataFormatToTFDataFormat(MklTensorFormat format) { - if (format == MklTensorFormat::FORMAT_NHWC || - format == MklTensorFormat::FORMAT_NDHWC) +inline TensorFormat MklDnnDataFormatToTFDataFormat(MKL_TENSOR_FORMAT format) { + if (format == MKL_TENSOR_FORMAT_NHWC || format == MKL_TENSOR_FORMAT_NDHWC) return FORMAT_NHWC; - else if (format == MklTensorFormat::FORMAT_NCHW || - format == MklTensorFormat::FORMAT_NCDHW) + if (format == MKL_TENSOR_FORMAT_NCHW || format == MKL_TENSOR_FORMAT_NCDHW) return FORMAT_NCHW; TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT, "Unsupported data format")); @@ -1117,51 +1016,6 @@ inline TensorFormat MklDnnDataFormatToTFDataFormat(MklTensorFormat format) { // that we don't come here. return FORMAT_NHWC; } -#else -/// Map TensorFlow's data format into MKL-DNN 3D data format -/// @input: TensorFlow data format -/// @return: memory::format corresponding to TensorFlow data format; -/// Fails with an error if invalid data format. -inline memory::format TFDataFormatToMklDnn3DDataFormat(TensorFormat format) { - if (format == FORMAT_NHWC) - return memory::format::ndhwc; - else if (format == FORMAT_NCHW) - return memory::format::ncdhw; - TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT, "Unsupported data format")); - return memory::format::format_undef; -} - -/// Map TensorFlow's data format into MKL-DNN data format -/// -/// @input: TensorFlow data format -/// @return: memory::format corresponding to TensorFlow data format; -/// Fails with an error if invalid data format. 
-inline memory::format TFDataFormatToMklDnnDataFormat(TensorFormat format) { - if (format == FORMAT_NHWC) - return memory::format::nhwc; - else if (format == FORMAT_NCHW) - return memory::format::nchw; - TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT, "Unsupported data format")); - return memory::format::format_undef; -} - -/// Map MKL-DNN data format to TensorFlow's data format -/// -/// @input: memory::format -/// @return: Tensorflow data format corresponding to memory::format -/// Fails with an error if invalid data format. -inline TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format) { - if (format == memory::format::nhwc || format == memory::format::ndhwc) - return FORMAT_NHWC; - else if (format == memory::format::nchw || format == memory::format::ncdhw) - return FORMAT_NCHW; - TF_CHECK_OK(Status(error::Code::INVALID_ARGUMENT, "Unsupported data format")); - - // Return to prevent compiler warnings, otherwise TF_CHECK_OK will ensure - // that we don't come here. - return FORMAT_NHWC; -} -#endif /// Map TensorShape object into memory::dims required by MKL-DNN /// @@ -1191,12 +1045,7 @@ inline memory::dims TFShapeToMklDnnDims(const TensorShape& shape) { inline memory::dims TFShapeToMklDnnDimsInNCHW(const TensorShape& shape, TensorFormat format) { // Check validity of format. - CHECK_NE(TFDataFormatToMklDnnDataFormat(format), -#ifdef ENABLE_MKLDNN_V1 - MklTensorFormat::FORMAT_UNDEF); -#else - memory::format::format_undef); -#endif + CHECK_NE(TFDataFormatToMklDnnDataFormat(format), MKL_TENSOR_FORMAT_INVALID); int n = shape.dim_size(GetTensorDimIndex(format, 'N')); int c = shape.dim_size(GetTensorDimIndex(format, 'C')); @@ -1210,12 +1059,7 @@ inline memory::dims TFShapeToMklDnnDimsInNCHW(const TensorShape& shape, inline memory::dims TFShapeToMklDnnDimsInNCDHW(const TensorShape& shape, TensorFormat format) { // Validate format. - CHECK_NE(TFDataFormatToMklDnn3DDataFormat(format), -#ifdef ENABLE_MKLDNN_V1 - MklTensorFormat::FORMAT_UNDEF); -#else - memory::format::format_undef); -#endif + CHECK_NE(TFDataFormatToMklDnn3DDataFormat(format), MKL_TENSOR_FORMAT_INVALID); int n = shape.dim_size(GetTensorDimIndex<3>(format, 'N')); int c = shape.dim_size(GetTensorDimIndex<3>(format, 'C')); @@ -1232,12 +1076,7 @@ inline memory::dims TFShapeToMklDnnDimsInNCDHW(const TensorShape& shape, inline memory::dims MklDnnDimsInNCHW(const memory::dims& in_dims, TensorFormat format) { // Validate format. 
- CHECK_NE(TFDataFormatToMklDnnDataFormat(format), -#ifdef ENABLE_MKLDNN_V1 - MklTensorFormat::FORMAT_UNDEF); -#else - memory::format::format_undef); -#endif + CHECK_NE(TFDataFormatToMklDnnDataFormat(format), MKL_TENSOR_FORMAT_INVALID); int n = in_dims[GetTensorDimIndex(format, 'N')]; int c = in_dims[GetTensorDimIndex(format, 'C')]; @@ -1290,7 +1129,6 @@ inline padding_kind TFPaddingToMklDnnPadding(Padding pad) { return padding_kind::zero; } -#ifdef ENABLE_MKLDNN_V1 /// Helper function to create memory descriptor in Blocked format /// /// @input: Tensor dimensions @@ -1303,6 +1141,7 @@ inline memory::desc CreateBlockedMemDescHelper(const memory::dims& dim, const memory::dims& strides, memory::data_type dtype) { DCHECK_EQ(dim.size(), strides.size()); +#ifdef ENABLE_MKLDNN_V1 mkldnn_dim_t input_dims[dim.size()]; mkldnn_dim_t input_strides[dim.size()]; for (size_t i = 0; i < dim.size(); ++i) { @@ -1310,28 +1149,14 @@ inline memory::desc CreateBlockedMemDescHelper(const memory::dims& dim, input_strides[i] = strides[i]; } mkldnn_memory_desc_t md; - DCHECK(mkldnn_memory_desc_init_by_strides(&md, dim.size(), input_dims, - memory::convert_to_c(dtype), - input_strides) == 0) + DCHECK_EQ(mkldnn_memory_desc_init_by_strides(&md, dim.size(), input_dims, + memory::convert_to_c(dtype), + input_strides), + 0) << "Failed to create blocked memory descriptor"; - return memory::desc(md); -} #else -/// Helper function to create memory descriptor in Blocked format -/// -/// @input: Tensor dimensions -/// @input: strides corresponding to dimensions. One can use utility -/// function such as CalculateTFStrides to compute strides -/// for given dimensions. -/// @return: memory::desc object corresponding to blocked memory format -/// for given dimensions and strides. -inline memory::desc CreateBlockedMemDescHelper(const memory::dims& dim, - const memory::dims& strides, - memory::data_type dtype) { - CHECK_EQ(dim.size(), strides.size()); - // We have to construct memory descriptor in a C style. This is not at all - // ideal but MKLDNN does not offer any API to construct descriptor in + // ideal but MKL-DNN does not offer any API to construct descriptor in // blocked format except a copy constructor that accepts // mkldnn_memory_desc_t. mkldnn_memory_desc_t md; @@ -1349,10 +1174,9 @@ inline memory::desc CreateBlockedMemDescHelper(const memory::dims& dim, md.dims[i] = dim[i]; } md.layout_desc.blocking.offset_padding = 0; - +#endif // ENABLE_MKLDNN_V1 return memory::desc(md); } -#endif template inline primitive FindOrCreateReorder(const memory* from, const memory* to); @@ -1404,7 +1228,6 @@ class MklDnnData { void SetIs3DData(bool bIs3D_) { bIs3D = bIs3D_; } bool GetIs3D() { return bIs3D; } -#ifdef ENABLE_MKLDNN_V1 /// Set user memory primitive using specified dimensions, memory format tag /// and data_buffer. Function automatically uses element data type by using /// input type T used for creating call object. @@ -1413,40 +1236,17 @@ class MklDnnData { /// an operation. E.g., filter of Conv2D is of shape {1, 2, 3, 4}, and /// memory format tag HWIO, and the buffer that contains actual values is /// pointed by data_buffer. - inline void SetUsrMem(const memory::dims& dim, memory::format_tag fm, + inline void SetUsrMem(const memory::dims& dim, MEMORY_FORMAT fm, void* data_buffer = nullptr) { auto md = memory::desc(dim, MklDnnType(), fm); SetUsrMem(md, data_buffer); } -#else - /// Set user memory primitive using specified dimensions, memory format and - /// data_buffer. 
Function automatically uses element data type by using - /// input type T used for creating call object. - /// - /// In a nutshell, function allows user to describe the input tensor to - /// an operation. E.g., filter of Conv2D is of shape {1, 2, 3, 4}, and - /// memory format HWIO, and the buffer that contains actual values is - /// pointed by data_buffer. - inline void SetUsrMem(const memory::dims& dim, memory::format fm, - void* data_buffer = nullptr) { - auto md = memory::desc(dim, MklDnnType(), fm); - SetUsrMem(md, data_buffer); - } -#endif -#ifdef ENABLE_MKLDNN_V1 - inline void SetUsrMem(const memory::dims& dim, memory::format_tag fm, + inline void SetUsrMem(const memory::dims& dim, MEMORY_FORMAT fm, const Tensor* tensor) { - CHECK_NOTNULL(tensor); + DCHECK(tensor); SetUsrMem(dim, fm, GetTensorBuffer(tensor)); } -#else - inline void SetUsrMem(const memory::dims& dim, memory::format fm, - const Tensor* tensor) { - CHECK_NOTNULL(tensor); - SetUsrMem(dim, fm, GetTensorBuffer(tensor)); - } -#endif /// Helper function to create memory descriptor in Blocked format /// @@ -1481,7 +1281,7 @@ class MklDnnData { } #ifndef ENABLE_MKLDNN_V1 - /// Memory primitive descriptor is deprecated in MKL-DNN v1.0. + /// Memory primitive descriptor is deprecated in MKL-DNN v1.x. /// A version of function to set user memory primitive that accepts memory /// descriptor directly, instead of accepting dimensions and format. This /// function is more generic that the one above, but the function above is @@ -1490,7 +1290,7 @@ class MklDnnData { auto pd = memory::primitive_desc(md, *cpu_engine_); SetUsrMem(pd, data_buffer); } -#endif +#endif // !ENABLE_MKLDNN_V1 /// A version of SetUsrMem with memory descriptor and tensor inline void SetUsrMem(const memory::desc& md, const Tensor* tensor) { @@ -1498,75 +1298,63 @@ class MklDnnData { SetUsrMem(md, GetTensorBuffer(tensor)); } -#ifdef ENABLE_MKLDNN_V1 /// A version of function to set user memory type that accepts memory /// descriptor directly, instead of accepting dimensions and format. This /// function is more generic than the one above, but the function above is /// sufficient in most cases. - inline void SetUsrMem(const memory::desc& md, void* data_buffer = nullptr) { - CHECK_NOTNULL(cpu_engine_); - if (user_memory_) delete user_memory_; - // TODO(nhasabni): can we remove dynamic memory allocation? - if (data_buffer) { - user_memory_ = new memory(md, *cpu_engine_, data_buffer); - } else { - user_memory_ = new memory(md, *cpu_engine_); - } - } -#else - /// A version of function to set user memory primitive that accepts primitive - /// descriptor directly, instead of accepting dimensions and format. This - /// function is more generic that the one above, but the function above is - /// sufficient in most cases. - inline void SetUsrMem(const memory::primitive_desc& pd, + inline void SetUsrMem(const MEMORY_PRIMITIVE_DESC& pd, void* data_buffer = nullptr) { - CHECK_NOTNULL(cpu_engine_); + DCHECK(cpu_engine_); if (user_memory_) delete user_memory_; // TODO(nhasabni): can we remove dynamic memory allocation? 
if (data_buffer) { +#ifdef ENABLE_MKLDNN_V1 + user_memory_ = new memory(pd, *cpu_engine_, data_buffer); +#else user_memory_ = new memory(pd, data_buffer); +#endif // ENABLE_MKLDNN_V1 } else { +#ifdef ENABLE_MKLDNN_V1 + user_memory_ = new memory(pd, *cpu_engine_); +#else user_memory_ = new memory(pd); +#endif // ENABLE_MKLDNN_V1 } } -#endif #ifndef ENABLE_MKLDNN_V1 /// Memory primitive descriptor is deprecated in MKL-DNN v1.x /// A version of SetUsrMem with primitive descriptor and tensor inline void SetUsrMem(const memory::primitive_desc& pd, const Tensor* tensor) { - CHECK_NOTNULL(tensor); + DCHECK(tensor); SetUsrMem(pd, GetTensorBuffer(tensor)); } -#endif +#endif // !ENABLE_MKLDNN_V1 /// Get function for user memory primitive. inline const memory* GetUsrMem() const { return user_memory_; } #ifndef ENABLE_MKLDNN_V1 - /// Memory primitive descriptor is deprecated in MKL-DNN v1.0. + /// Memory primitive descriptor is deprecated in MKL-DNN v1.x. /// Get function for primitive descriptor of user memory primitive. inline const memory::primitive_desc GetUsrMemPrimDesc() const { - CHECK_NOTNULL(user_memory_); + DCHECK(user_memory_); return user_memory_->get_primitive_desc(); } -#endif +#endif // !ENABLE_MKLDNN_V1 -#ifdef ENABLE_MKLDNN_V1 /// Get function for descriptor of user memory. inline memory::desc GetUsrMemDesc() const { - CHECK_NOTNULL(user_memory_); +#ifdef ENABLE_MKLDNN_V1 + DCHECK(user_memory_); return user_memory_->get_desc(); - } #else - /// Get function for descriptor of user memory. - inline memory::desc GetUsrMemDesc() { // This is ugly. Why MKL-DNN does not provide desc() method of const type?? const memory::primitive_desc pd = GetUsrMemPrimDesc(); return const_cast(&pd)->desc(); +#endif // ENABLE_MKLDNN_V1 } -#endif /// Get function for data buffer of user memory primitive. inline void* GetUsrMemDataHandle() const { @@ -1608,56 +1396,36 @@ class MklDnnData { return reorder_memory_ ? *reorder_memory_ : *user_memory_; } -#ifdef ENABLE_MKLDNN_V1 /// Set memory descriptor of an operation in terms of dimensions and memory /// format. E.g., For Conv2D, the dimensions would be same as user dimensions /// but memory::format_tag would be mkldnn::any because we want MKL-DNN to /// choose the best layout/format for given input dimensions. - inline void SetOpMemDesc(const memory::dims& dim, memory::format_tag fm) { + inline void SetOpMemDesc(const memory::dims& dim, MEMORY_FORMAT fm) { // TODO(nhasabni): can we remove dynamic memory allocation? op_md_ = new memory::desc(dim, MklDnnType(), fm); } -#else - /// Set memory descriptor of an operation in terms of dimensions and memory - /// format. E.g., For Conv2D, the dimensions would be same as user dimensions - /// but memory::format would be mkldnn::any because we want MKL-DNN to choose - /// best layout/format for given input dimensions. - inline void SetOpMemDesc(const memory::dims& dim, memory::format fm) { - // TODO(nhasabni): can we remove dynamic memory allocation? - op_md_ = new memory::desc(dim, MklDnnType(), fm); - } -#endif /// Get function for memory descriptor for an operation inline const memory::desc& GetOpMemDesc() const { return *op_md_; } -#ifdef ENABLE_MKLDNN_V1 /// Predicate that checks if we need to reorder user's memory into memory /// pointed by op_md. /// /// @input: op_md - memory descriptor of the given input of an operation. /// @return: true in case reorder of input is needed; false, otherwise. 
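  // A rough usage sketch for the SetUsrMem/SetOpMemDesc helpers above, hedged:
  // it assumes an ENABLE_MKLDNN_V1 build, and `src`, `input_dims`, and
  // `input_tensor` are placeholder names rather than symbols defined here.
  //
  //   // Given an MklDnnData<float> `src` bound to the CPU engine:
  //   src.SetUsrMem(input_dims, MEMORY_FORMAT::nhwc, &input_tensor);  // TF-side layout
  //   src.SetOpMemDesc(input_dims, MEMORY_FORMAT::any);  // let MKL-DNN pick the op layout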
- inline bool IsReorderNeeded(const memory::desc& op_md) const { - CHECK_NOTNULL(user_memory_); - return op_md != user_memory_->get_desc(); - } + inline bool IsReorderNeeded(const MEMORY_PRIMITIVE_DESC& op_pd) const { + DCHECK(user_memory_); +#ifdef ENABLE_MKLDNN_V1 + return op_pd != user_memory_->get_desc(); #else - /// Predicate that checks if we need to reorder user's memory into memory - /// pointed by op_pd. - /// - /// @input: op_pd - memory primitive descriptor of the given input of an - /// operation - /// @return: true in case reorder of input is needed; false, otherwise. - inline bool IsReorderNeeded(const memory::primitive_desc& op_pd) const { - CHECK_NOTNULL(user_memory_); return op_pd != user_memory_->get_primitive_desc(); +#endif // ENABLE_MKLDNN_V1 } -#endif #ifndef ENABLE_MKLDNN_V1 - /// In MKL-DNN v1.0, it it is not possible to directly compare two memory + /// In MKL-DNN v1.x, it it is not possible to directly compare two memory /// format tags since they only provide a partial description of the memory - /// layout. Hence, this function is disabled for MKL-DNN v1.0. + /// layout. Hence, this function is disabled for MKL-DNN v1.x. /// /// Predicate that checks if we need to reorder user's memory into memory /// based on the provided format. @@ -1670,7 +1438,7 @@ class MklDnnData { return target_format != user_memory_->get_primitive_desc().desc().data.format; } -#endif +#endif // !ENABLE_MKLDNN_V1 /// Function to create a reorder from memory pointed by from to memory pointed /// by to. Returns created primitive. @@ -1680,28 +1448,29 @@ class MklDnnData { return reorder(*from, *to); } +/// Function to handle input reordering +/// +/// Check if we need to reorder this input of an operation. +/// Return true and allocate reorder memory primitive if reorder is needed. +/// Otherwise, return false and do not allocate reorder memory primitive. +/// +/// To check if reorder is needed, this function compares memory primitive +/// descriptor (memory descriptor for v1.x) of an operation (op_pd) for +/// the given input with the user-specified memory descriptor. +/// +/// @input: op_pd - memory primitive descriptor of the given input of an +/// operation +/// @input: net - net to which to add reorder primitive in case it is needed. +/// @input: net_args - net to which user and reorder memories are added if +/// needed. Each entry is a key-value pair of the form +/// . +/// @return: true in case reorder of input is needed; false, otherwise. #ifdef ENABLE_MKLDNN_V1 - /// Function to handle input reordering - /// - /// Check if we need to reorder this input of an operation. - /// Return true and allocate reorder memory primitive if reorder is needed. - /// Otherwise, return false and do not allocate reorder memory primitive. - /// - /// To check if reorder is needed, this function compares memory descriptor - /// of an operation (op_md) for the given input with the - /// user-specified memory descriptor. - /// - /// @input: op_md - memory descriptor of the given input of an operation - /// @input: net - net to which to add reorder primitive in case it is needed. - /// @input: net_args - net to which user and reorder memories are added if - /// needed. Each entry is a key-value pair of the form - /// . - /// @return: true in case reorder of input is needed; false, otherwise. 
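  // Hedged caller-side sketch for the net/net_args overload documented above
  // (ENABLE_MKLDNN_V1 build assumed; `src`, `op_md`, and `cpu_engine` are
  // placeholder names). The execution loop mirrors the one used by
  // InsertReorderToUserMem further below.
  //
  //   std::vector<primitive> net;
  //   std::vector<MemoryArgsMap> net_args;
  //   src.CheckReorderToOpMem(op_md, net, net_args, cpu_engine);
  //   stream cpu_stream(cpu_engine);
  //   for (size_t i = 0; i < net.size(); ++i) {
  //     net.at(i).execute(cpu_stream, net_args.at(i));
  //   }
  //   cpu_stream.wait();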
inline bool CheckReorderToOpMem(const memory::desc& op_md, std::vector& net, std::vector& net_args, const engine& engine) { - CHECK_NOTNULL(user_memory_); + DCHECK(user_memory_); DCHECK_EQ(net.size(), net_args.size()); if (IsReorderNeeded(op_md)) { // TODO(nhasabni): can we remove dynamic memory allocation? @@ -1709,47 +1478,28 @@ class MklDnnData { net.push_back(CreateReorder(user_memory_, reorder_memory_)); net_args.push_back(MemoryArgsMap{{MKLDNN_ARG_FROM, *user_memory_}, {MKLDNN_ARG_TO, *reorder_memory_}}); - return true; - } - return false; - } #else - /// Function to handle input reordering - /// - /// Check if we need to reorder this input of an operation. - /// Return true and allocate reorder memory primitive if reorder is needed. - /// Otherwise, return false and do not allocate reorder memory primitive. - /// - /// To check if reorder is needed, this function compares memory primitive - /// descriptor of an operation (op_pd) for the given input with the - /// user-specified memory primitive descriptor. - /// - /// @input: op_pd - memory primitive descriptor of the given input of an - /// operation - /// @input: net - net to which to add reorder primitive in case it is needed. - /// @return: true in case reorder of input is needed; false, otherwise. inline bool CheckReorderToOpMem(const memory::primitive_desc& op_pd, std::vector* net) { - CHECK_NOTNULL(net); - CHECK_NOTNULL(user_memory_); + DCHECK(net); + DCHECK(user_memory_); if (IsReorderNeeded(op_pd)) { - // TODO(nhasabni): can we remove dynamic memory allocation? reorder_memory_ = new memory(op_pd); net->push_back(CreateReorder(user_memory_, reorder_memory_)); +#endif // ENABLE_MKLDNN_V1 return true; } return false; } -#endif +/// TODO: this is a faster path with reorder primitive cache compared with +/// CheckReorderToOpMem(..., std::vector* net). +/// TODO(gzmkl): Remove the slower path. #ifdef ENABLE_MKLDNN_V1 /// TODO(bhavanis): Need to use reorder cache here for better performance. - /// TODO: this is a faster path with reorder primitive cache compared with - /// CheckReorderToOpMem(..., std::vector* net). - /// TODO(gzmkl): Remove the slower path. inline bool CheckReorderToOpMem(const memory::desc& op_md, const engine& engine) { - CHECK_NOTNULL(user_memory_); + DCHECK(user_memory_); if (IsReorderNeeded(op_md)) { // TODO(nhasabni): can we remove dynamic memory allocation? // primitive reuse don't allow two same reorder prim in @@ -1758,72 +1508,49 @@ class MklDnnData { stream cpu_stream(engine); reorder(*user_memory_, *reorder_memory_) .execute(cpu_stream, *user_memory_, *reorder_memory_); - return true; - } - return false; - } #else - /// This is a faster path with reorder primitive cache compared with - /// CheckReorderToOpMem(..., std::vector* net). - /// TODO(gzmkl): Remove the slower path. inline bool CheckReorderToOpMem(const memory::primitive_desc& op_pd) { CHECK_NOTNULL(user_memory_); if (IsReorderNeeded(op_pd)) { - // TODO(nhasabni): can we remove dynamic memory allocation? - // primitive reuse don't allow two same reorder prim in - // one stream, so submit it immediately reorder_memory_ = new memory(op_pd); std::vector net; net.push_back(FindOrCreateReorder(user_memory_, reorder_memory_)); stream(stream::kind::eager).submit(net).wait(); +#endif // ENABLE_MKLDNN_V1 return true; } return false; } -#endif +/// Overloaded version of above function that accepts memory buffer +/// where output of reorder needs to be stored. 
+/// +/// @input: op_pd - memory primitive descriptor (memory descriptor for v1.x) +/// of the given input of an operation +/// @reorder_data_handle - memory buffer where output of reorder needs to be +/// stored. Primitive does not check if buffer has +/// enough size to write. +/// @input: net - net to which to add reorder primitive in case it is needed. +/// @input: net_args - net to which user and reorder memories are added if +/// needed. Each entry is a key-value pair of the form +/// . +/// @input: engine - MKL-DNN's abstraction of a computational device +/// @return: true in case reorder of input is needed; false, otherwise. #ifdef ENABLE_MKLDNN_V1 - /// Overloaded version of above function that accepts memory buffer - /// where output of reorder needs to be stored. - /// - /// @input: op_md - memory descriptor of the given input of an operation - /// @reorder_data_handle - memory buffer where output of reorder needs to be - /// stored. Primitive does not check if buffer has - /// enough size to write. - /// @input: net - net to which to add reorder primitive in case it is needed. - /// @input: net_args - net to which user and reorder memories are added if - /// needed. Each entry is a key-value pair of the form - /// . - /// @input: engine - MKL-DNN's abstraction of a computational device - /// @return: true in case reorder of input is needed; false, otherwise. inline bool CheckReorderToOpMem(const memory::desc& op_md, void* reorder_data_handle, std::vector& net, std::vector& net_args, const engine& engine) { - CHECK_NOTNULL(reorder_data_handle); - CHECK_NOTNULL(user_memory_); + DCHECK(reorder_data_handle); + DCHECK(user_memory_); if (IsReorderNeeded(op_md)) { // TODO(nhasabni): can we remove dynamic memory allocation? reorder_memory_ = new memory(op_md, engine, reorder_data_handle); net.push_back(CreateReorder(user_memory_, reorder_memory_)); net_args.push_back(MemoryArgsMap{{MKLDNN_ARG_FROM, *user_memory_}, {MKLDNN_ARG_TO, *reorder_memory_}}); - return true; - } - return false; - } #else - /// Overloaded version of above function that accepts memory buffer - /// where output of reorder needs to be stored. - /// - /// @input: op_pd - memory primitive descriptor of the given input of an - /// operation - /// @reorder_data_handle - memory buffer where output of reorder needs to be - /// stored. Primitive does not check if buffer is - /// enough size to write. - /// @input: net - net to which to add reorder primitive in case it is needed. - /// @return: true in case reorder of input is needed; false, otherwise. inline bool CheckReorderToOpMem(const memory::primitive_desc& op_pd, void* reorder_data_handle, std::vector* net) { @@ -1831,22 +1558,24 @@ class MklDnnData { CHECK_NOTNULL(reorder_data_handle); CHECK_NOTNULL(user_memory_); if (IsReorderNeeded(op_pd)) { - // TODO(nhasabni): can we remove dynamic memory allocation? reorder_memory_ = new memory(op_pd, reorder_data_handle); net->push_back(CreateReorder(user_memory_, reorder_memory_)); +#endif // ENABLE_MKLDNN_V1 return true; } return false; } -#endif +/// This is a faster path with reorder primitive cache compared with +/// CheckReorderToOpMem(..., std::vector* net). +/// The slower path will be removed in the future #ifdef ENABLE_MKLDNN_V1 /// TODO(bhavanis): Need to use reorder cache here for better performance. 
inline bool CheckReorderToOpMem(const memory::desc& op_md, void* reorder_data_handle, const engine& engine) { - CHECK_NOTNULL(reorder_data_handle); - CHECK_NOTNULL(user_memory_); + DCHECK(reorder_data_handle); + DCHECK(user_memory_); if (IsReorderNeeded(op_md)) { // TODO(nhasabni): can we remove dynamic memory allocation? // primitive reuse don't allow two same reorder prim in @@ -1855,66 +1584,47 @@ class MklDnnData { stream cpu_stream(engine); reorder(*user_memory_, *reorder_memory_) .execute(cpu_stream, *user_memory_, *reorder_memory_); - return true; - } - return false; - } #else - /// This is a faster path with reorder primitive cache compared with - /// CheckReorderToOpMem(..., std::vector* net). - /// The slower path will be removed in the future inline bool CheckReorderToOpMem(const memory::primitive_desc& op_pd, void* reorder_data_handle) { CHECK_NOTNULL(reorder_data_handle); CHECK_NOTNULL(user_memory_); if (IsReorderNeeded(op_pd)) { - // TODO(nhasabni): can we remove dynamic memory allocation? - // primitive reuse don't allow two same reorder prim in - // one stream, so submit it immediately std::vector net; reorder_memory_ = new memory(op_pd, reorder_data_handle); net.push_back(FindOrCreateReorder(user_memory_, reorder_memory_)); stream(stream::kind::eager).submit(net).wait(); +#endif // ENABLE_MKLDNN_V1 return true; } return false; } -#endif +/// Another overloaded version of CheckReorderToOpMem that accepts Tensor +/// where output of reorder needs to be stored. +/// +/// @input: op_md - memory primitive descriptor (memory descriptor for v1.x) +/// of the given input of an operation +/// @reorder_tensor - Tensor whose buffer is to be used to store output of +/// reorder. Primitive does not check if buffer is +/// enough size to write. +/// @input: net - net to which to add reorder primitive in case it is needed. +/// @input: net_args - net to which user and reorder memories are added if +/// needed. Each entry is a key-value pair of the form +/// . +/// @input: engine - MKL-DNN's abstraction of a computational device +/// @return: true in case reorder of input is needed; false, otherwise. #ifdef ENABLE_MKLDNN_V1 - /// Another overloaded version of CheckReorderToOpMem that accepts Tensor - /// where output of reorder needs to be stored. - /// - /// @input: op_md - memory descriptor of the given input of an operation - /// @reorder_tensor - Tensor whose buffer is to be used to store output of - /// reorder. Primitive does not check if buffer is - /// enough size to write. - /// @input: net - net to which to add reorder primitive in case it is needed. - /// @input: net_args - net to which user and reorder memories are added if - /// needed. Each entry is a key-value pair of the form - /// . - /// @input: engine - MKL-DNN's abstraction of a computational device - /// @return: true in case reorder of input is needed; false, otherwise. inline bool CheckReorderToOpMem(const memory::desc& op_md, Tensor* reorder_tensor, std::vector& net, std::vector& net_args, const engine& engine) { - CHECK_NOTNULL(reorder_tensor); + DCHECK(reorder_tensor); return CheckReorderToOpMem(op_md, GetTensorBuffer(reorder_tensor), net, net_args, engine); } #else - /// Another overloaded version of CheckReorderToOpMem that accepts Tensor - /// where output of reorder needs to be stored. - /// - /// @input: op_pd - memory primitive descriptor of the given input of an - /// operation - /// @reorder_tensor - Tensor whose buffer is to be used to store output of - /// reorder. 
Primitive does not check if buffer is - /// enough size to write. - /// @input: net - net to which to add reorder primitive in case it is needed. - /// @return: true in case reorder of input is needed; false, otherwise. inline bool CheckReorderToOpMem(const memory::primitive_desc& op_pd, Tensor* reorder_tensor, std::vector* net) { @@ -1922,31 +1632,23 @@ class MklDnnData { CHECK_NOTNULL(reorder_tensor); return CheckReorderToOpMem(op_pd, GetTensorBuffer(reorder_tensor), net); } -#endif +#endif // ENABLE_MKLDNN_V1 -#ifdef ENABLE_MKLDNN_V1 /// TODO: this is a faster path with reorder primitive cache compared with /// CheckReorderToOpMem(op_md, reorder_tensor, net, net_args, engine), will /// remove /// slow path in the future - inline bool CheckReorderToOpMem(const memory::desc& op_md, + inline bool CheckReorderToOpMem(const MEMORY_PRIMITIVE_DESC& op_pd, Tensor* reorder_tensor) { - CHECK_NOTNULL(reorder_tensor); - return CheckReorderToOpMem(op_md, GetTensorBuffer(reorder_tensor), + DCHECK(reorder_tensor); +#ifdef ENABLE_MKLDNN_V1 + return CheckReorderToOpMem(op_pd, GetTensorBuffer(reorder_tensor), *cpu_engine_); - } #else - /// TODO: this is a faster path with reorder primitive cache compared with - /// CheckReorderToOpMem(..., std::vector* net), will remove - /// slow path in the future - inline bool CheckReorderToOpMem(const memory::primitive_desc& op_pd, - Tensor* reorder_tensor) { - CHECK_NOTNULL(reorder_tensor); return CheckReorderToOpMem(op_pd, GetTensorBuffer(reorder_tensor)); +#endif // ENABLE_MKLDNN_V1 } -#endif -#ifdef ENABLE_MKLDNN_V1 /// Function to handle output reorder /// /// This function performs very similar functionality as input reordering @@ -1957,89 +1659,65 @@ class MklDnnData { /// reorder is needed. And this temporary buffer will hold the output of /// an operation before it is fed to reorder primitive. /// - /// @input memory descriptor for the given output of an operation + /// @input - memory primitive descriptor (memory descriptor for v1.x) for the + /// given output of an operation /// @return: true in case reorder of output is needed; false, otherwise. - inline bool PrepareReorderToUserMemIfReq(const memory::desc& op_md) { - CHECK_NOTNULL(user_memory_); - if (IsReorderNeeded(op_md)) { - // TODO(nhasabni): can we remove dynamic memory allocation? - reorder_memory_ = new memory(op_md, *cpu_engine_); - return true; - } - return false; - } -#else - /// Function to handle output reorder - /// - /// This function performs very similar functionality as input reordering - /// function above. The only difference is that this function does not add - /// reorder primitive to the net. The reason for this is: the reorder - /// primitive for output needs to be added to the list only after operation - /// has executed. But we need to prepare a temporary buffer in case output - /// reorder is needed. And this temporary buffer will hold the output of - /// an operation before it is fed to reorder primitive. - /// - /// @input memory primitive descriptor for the given output of an operation - /// @return: true in case reorder of output is needed; false, otherwise. - inline bool PrepareReorderToUserMemIfReq( - const memory::primitive_desc& op_pd) { - CHECK_NOTNULL(user_memory_); + inline bool PrepareReorderToUserMemIfReq(const MEMORY_PRIMITIVE_DESC& op_pd) { + DCHECK(user_memory_); if (IsReorderNeeded(op_pd)) { - // TODO(nhasabni): can we remove dynamic memory allocation? +// TODO(nhasabni): can we remove dynamic memory allocation? 
+#ifdef ENABLE_MKLDNN_V1 + reorder_memory_ = new memory(op_pd, *cpu_engine_); +#else reorder_memory_ = new memory(op_pd); +#endif // ENABLE_MKLDNN_V1 return true; } return false; } -#endif +/// Function to actually insert reorder primitive in the net +/// +/// This function completes remaining part of output reordering. It inserts +/// a reordering primitive from the temporary buffer that holds the output +/// to the user-specified output buffer. +/// +/// @input: net - net to which to add reorder primitive +/// @input: net_args - net to which user and reorder memories are added if +/// needed. Each entry is a key-value pair of the form +/// . #ifdef ENABLE_MKLDNN_V1 - /// Function to actually insert reorder primitive in the net - /// - /// This function completes remaining part of output reordering. It inserts - /// a reordering primitive from the temporary buffer that holds the output - /// to the user-specified output buffer. - /// - /// @input: net - net to which to add reorder primitive - /// @input: net_args - net to which user and reorder memories are added if - /// needed. Each entry is a key-value pair of the form - /// . inline void InsertReorderToUserMem(std::vector& net, std::vector& net_args) { - CHECK_NOTNULL(user_memory_); - CHECK_NOTNULL(reorder_memory_); + DCHECK(user_memory_); + DCHECK(reorder_memory_); net.push_back(CreateReorder(reorder_memory_, user_memory_)); net_args.push_back(MemoryArgsMap{{MKLDNN_ARG_FROM, *reorder_memory_}, {MKLDNN_ARG_TO, *user_memory_}}); } #else - /// Function to actually insert reorder primitive in the net - /// - /// This function completes remaining part of output reordering. It inserts - /// a reordering primitive from the temporary buffer that holds the output - /// to the user-specified output buffer. 
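  // Hedged output-side counterpart to the input sketch earlier (placeholder
  // names: `dst` is an MklDnnData<T> describing the op's output, `output_md`
  // is the output memory descriptor chosen by the primitive):
  //
  //   if (dst.PrepareReorderToUserMemIfReq(output_md)) {
  //     // Run the primitive so it writes into dst.GetOpMem() (the temporary
  //     // buffer), then queue the reorder back into the TensorFlow layout.
  //     dst.InsertReorderToUserMem(net, net_args);
  //   }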
- /// - /// @input: net - net to which to add reorder primitive inline void InsertReorderToUserMem(std::vector* net) { CHECK_NOTNULL(net); CHECK_NOTNULL(user_memory_); CHECK_NOTNULL(reorder_memory_); net->push_back(CreateReorder(reorder_memory_, user_memory_)); } -#endif +#endif // ENABLE_MKLDNN_V1 -#ifdef ENABLE_MKLDNN_V1 /// TODO: this is a faster path with reorder primitive cache compared with /// InsertReorderToUserMem(net, net_args), will remove /// slow path in the future inline void InsertReorderToUserMem() { - CHECK_NOTNULL(user_memory_); - CHECK_NOTNULL(reorder_memory_); - CHECK_NOTNULL(cpu_engine_); + DCHECK(user_memory_); + DCHECK(reorder_memory_); +#ifdef ENABLE_MKLDNN_V1 + DCHECK(cpu_engine_); stream cpu_stream(cpu_engine_); +#endif // ENABLE_MKLDNN_V1 // primitive reuse don't allow two same reorder prim in // one stream, so submit it immediately std::vector net; +#ifdef ENABLE_MKLDNN_V1 std::vector net_args; net.push_back(FindOrCreateReorder(reorder_memory_, user_memory_)); net_args.push_back(MemoryArgsMap{{MKLDNN_ARG_FROM, *reorder_memory_}, @@ -2049,21 +1727,11 @@ class MklDnnData { net.at(i).execute(cpu_stream, net_args.at(i)); } cpu_stream.wait(); - } #else - /// TODO: this is a faster path with reorder primitive cache compared with - /// InsertReorderToUserMem(std::vector* net), will remove - /// slow path in the future - inline void InsertReorderToUserMem() { - CHECK_NOTNULL(user_memory_); - CHECK_NOTNULL(reorder_memory_); - // primitive reuse don't allow two same reorder prim in - // one stream, so submit it immediately - std::vector net; net.push_back(FindOrCreateReorder(reorder_memory_, user_memory_)); stream(stream::kind::eager).submit(net).wait(); +#endif // ENABLE_MKLDNN_V1 } -#endif }; /// Base class for operations with reuse of primitives @@ -2256,41 +1924,20 @@ class FactoryKeyCreator { } }; -#ifdef ENABLE_MKLDNN_V1 -static inline memory::format_tag get_desired_format(int channel, - bool is_2d = true) { - memory::format_tag fmt_desired = memory::format_tag::any; +static inline MEMORY_FORMAT get_desired_format(int channel, bool is_2d = true) { + MEMORY_FORMAT fmt_desired = MEMORY_FORMAT::any; if (port::TestCPUFeature(port::CPUFeature::AVX512F)) { - fmt_desired = - is_2d ? memory::format_tag::nChw16c : memory::format_tag::nCdhw16c; + fmt_desired = is_2d ? MEMORY_FORMAT::nChw16c : MEMORY_FORMAT::nCdhw16c; } else if (port::TestCPUFeature(port::CPUFeature::AVX2) && (channel % 8) == 0) { - fmt_desired = - is_2d ? memory::format_tag::nChw8c - : memory::format_tag::ncdhw; // no avx2 support for 3d yet. + fmt_desired = is_2d ? MEMORY_FORMAT::nChw8c + : MEMORY_FORMAT::ncdhw; // no avx2 support for 3d yet. } else { - fmt_desired = is_2d ? memory::format_tag::nchw : memory::format_tag::ncdhw; + fmt_desired = is_2d ? MEMORY_FORMAT::nchw : MEMORY_FORMAT::ncdhw; } return fmt_desired; } -#else -static inline memory::format get_desired_format(int channel, - bool is_2d = true) { - memory::format fmt_desired = memory::format::any; - - if (port::TestCPUFeature(port::CPUFeature::AVX512F)) { - fmt_desired = is_2d ? memory::format::nChw16c : memory::format::nCdhw16c; - } else if (port::TestCPUFeature(port::CPUFeature::AVX2) && - (channel % 8) == 0) { - fmt_desired = is_2d ? memory::format::nChw8c - : memory::format::ncdhw; // no avx2 support for 3d yet. - } else { - fmt_desired = is_2d ? 
memory::format::nchw : memory::format::ncdhw; - } - return fmt_desired; -} -#endif class MklReorderPrimitive : public MklPrimitive { public: @@ -2315,30 +1962,24 @@ class MklReorderPrimitive : public MklPrimitive { : src_mem(nullptr), dst_mem(nullptr), reorder_prim(nullptr) {} } context_; -#ifdef ENABLE_MKLDNN_V1 - engine cpu_engine_ = engine(engine::kind::cpu, 0); -#else - engine cpu_engine_ = engine(engine::cpu, 0); -#endif + engine cpu_engine_ = engine(ENGINE_CPU, 0); -#ifdef ENABLE_MKLDNN_V1 - void Setup(const memory* from, const memory* to) { - context_.src_mem.reset( - new memory(from->get_desc(), cpu_engine_, DummyData)); - context_.dst_mem.reset(new memory(to->get_desc(), cpu_engine_, DummyData)); - context_.reorder_prim = std::make_shared( - reorder(*context_.src_mem, *context_.dst_mem)); - } -#else void Setup(const memory* from, const memory* to) { context_.src_mem.reset(new memory( +#ifdef ENABLE_MKLDNN_V1 + from->get_desc(), cpu_engine_, DummyData)); +#else {from->get_primitive_desc().desc(), cpu_engine_}, DummyData)); - context_.dst_mem.reset( - new memory({to->get_primitive_desc().desc(), cpu_engine_}, DummyData)); +#endif // ENABLE_MKLDNN_V1 + context_.dst_mem.reset(new memory( +#ifdef ENABLE_MKLDNN_V1 + to->get_desc(), cpu_engine_, DummyData)); +#else + {to->get_primitive_desc().desc(), cpu_engine_}, DummyData)); +#endif // ENABLE_MKLDNN_V1 context_.reorder_prim = std::make_shared( reorder(*context_.src_mem, *context_.dst_mem)); } -#endif }; template @@ -2365,59 +2006,53 @@ class MklReorderPrimitiveFactory : public MklPrimitiveFactory { MklReorderPrimitiveFactory() {} ~MklReorderPrimitiveFactory() {} -#ifdef ENABLE_MKLDNN_V1 static string CreateKey(const memory* from, const memory* to) { string prefix = "reorder"; FactoryKeyCreator key_creator; +#ifdef ENABLE_MKLDNN_V1 auto const& from_desc = from->get_desc().data; auto const& to_desc = to->get_desc().data; - const int KIdxFirstStride = 0; - memory::dims from_dims(from_desc.dims, &from_desc.dims[from_desc.ndims]); - memory::dims to_dims(to_desc.dims, &to_desc.dims[to_desc.ndims]); - memory::dims from_strides( - from_desc.format_desc.blocking.strides, - &from_desc.format_desc.blocking.strides[from_desc.ndims]); - memory::dims to_strides( - to_desc.format_desc.blocking.strides, - &to_desc.format_desc.blocking.strides[to_desc.ndims]); - key_creator.AddAsKey(prefix); - // `format_kind` is not added since it will always set to `mkldnn_blocked` - key_creator.AddAsKey(static_cast(from_desc.data_type)); - key_creator.AddAsKey(from_dims); - key_creator.AddAsKey(from_strides); - key_creator.AddAsKey(static_cast(to_desc.data_type)); - key_creator.AddAsKey(to_dims); - key_creator.AddAsKey(to_strides); - return key_creator.GetKey(); - } #else - static string CreateKey(const memory* from, const memory* to) { - string prefix = "reorder"; - FactoryKeyCreator key_creator; auto const& from_desc = from->get_primitive_desc().desc().data; auto const& to_desc = to->get_primitive_desc().desc().data; +#endif // ENABLE_MKLDNN_V1 const int KIdxFirstStride = 0; memory::dims from_dims(from_desc.dims, &from_desc.dims[from_desc.ndims]); memory::dims to_dims(to_desc.dims, &to_desc.dims[to_desc.ndims]); memory::dims from_strides( +#ifdef ENABLE_MKLDNN_V1 + from_desc.format_desc.blocking.strides, + &from_desc.format_desc.blocking.strides[from_desc.ndims]); +#else from_desc.layout_desc.blocking.strides[KIdxFirstStride], &from_desc.layout_desc.blocking .strides[KIdxFirstStride][from_desc.ndims]); +#endif // ENABLE_MKLDNN_V1 memory::dims to_strides( +#ifdef 
ENABLE_MKLDNN_V1 + to_desc.format_desc.blocking.strides, + &to_desc.format_desc.blocking.strides[to_desc.ndims]); +#else to_desc.layout_desc.blocking.strides[KIdxFirstStride], &to_desc.layout_desc.blocking.strides[KIdxFirstStride][to_desc.ndims]); +#endif // ENABLE_MKLDNN_V1 key_creator.AddAsKey(prefix); +#ifndef ENABLE_MKLDNN_V1 + // `format_kind` is not added in v1.x since it will always set to + // `mkldnn_blocked` key_creator.AddAsKey(static_cast(from_desc.format)); +#endif // !ENABLE_MKLDNN_V1 key_creator.AddAsKey(static_cast(from_desc.data_type)); key_creator.AddAsKey(from_dims); key_creator.AddAsKey(from_strides); +#ifndef ENABLE_MKLDNN_V1 key_creator.AddAsKey(static_cast(to_desc.format)); +#endif // !ENABLE_MKLDNN_V1 key_creator.AddAsKey(static_cast(to_desc.data_type)); key_creator.AddAsKey(to_dims); key_creator.AddAsKey(to_strides); return key_creator.GetKey(); } -#endif MklPrimitive* GetReorder(const memory* from, const memory* to) { string key = CreateKey(from, to); @@ -2453,6 +2088,21 @@ inline bool IsConv1x1StrideNot1(memory::dims filter_dims, ((strides[0] != 1) || (strides[1] != 1))); } +#undef ENGINE_CPU +#undef MEMORY_FORMAT +#undef MKL_TENSOR_FORMAT +#undef MKL_TENSOR_FORMAT_BLOCKED +#undef MKL_TENSOR_FORMAT_INVALID +#undef MKL_TENSOR_FORMAT_NCDHW +#undef MKL_TENSOR_FORMAT_NDHWC +#undef MKL_TENSOR_FORMAT_NHWC +#undef MKL_TENSOR_FORMAT_NCHW +#undef MKL_TENSOR_FORMAT_UNDEF +#undef MEMORY_DATA_TYPE_UNDEF +#undef MEMORY_PRIMITIVE_DESC +#undef TENSOR_FORMAT +#undef TENSOR_FORMAT_NHWC + } // namespace tensorflow #endif // INTEL_MKL #endif // TENSORFLOW_CORE_UTIL_MKL_UTIL_H_ From 558eb7868873317bac01afd3e1932886133db7fd Mon Sep 17 00:00:00 2001 From: Niranjan Hasabnis Date: Mon, 15 Jul 2019 17:29:30 -0700 Subject: [PATCH 0098/3053] Fixing static scan issue in mkl_layout_pass.cc --- tensorflow/core/graph/mkl_layout_pass.cc | 40 +++++++++++++----------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index df3cf19e2c0..15a727a6c13 100644 --- a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -3184,7 +3184,6 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr* g, // Create node. Node* new_node; TF_CHECK_OK(nb.Finalize(&**g, &new_node)); - CHECK_NOTNULL(new_node); // In the following code of this function, an unsorted set is used to make // sure no duplicated edges be added into the new node. Therefore, we can @@ -3375,7 +3374,8 @@ Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr* g, // Create node. Node* new_node; TF_CHECK_OK(nb.Finalize(&**g, &new_node)); - DCHECK(new_node); + // No need to check if new_node is null because it will be null only when + // Finalize fails. // Incoming data edges from 'pred' node and 'succ' node to new 'new_node' // node are already copied in BuildNode. @@ -3484,7 +3484,6 @@ Status MklLayoutRewritePass::MergeConv2DBackpropFilterWithBiasAddGrad( // Create node. Node* new_node; TF_CHECK_OK(nb.Finalize(&**g, &new_node)); - CHECK_NOTNULL(new_node); // In the following code of this function, an unsorted set is used to make // sure no duplicated edges be added into the new node. Therefore, we can @@ -3641,7 +3640,6 @@ Status MklLayoutRewritePass::RewriteNodeForLayoutPropagation( if (s != Status::OK()) { return s; } - DCHECK(*new_node != nullptr); // In the following code of this function, an unsorted set is used to make // sure no duplicated edges be added into the new node. 
Therefore, we can @@ -3717,7 +3715,6 @@ Status MklLayoutRewritePass::RewriteNodeForJustOpNameChange( if (s != Status::OK()) { return s; } - DCHECK(*new_node != nullptr); // In the following code of this function, an unsorted set is used to make // sure no duplicated edges be added into the new node. Therefore, we can @@ -3774,7 +3771,6 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr* g, "RewriteNode will fail."); } TF_CHECK_OK(ret_status); - DCHECK(new_node != nullptr); // Copy the runtime device assigned from original code to new node. new_node->set_assigned_device_name(orig_node->assigned_device_name()); @@ -3793,19 +3789,24 @@ const MklLayoutRewritePass::RewriteInfo* MklLayoutRewritePass::CheckForQuantizedNodeRewrite(const Node* n) const { DataType T1, T2; DataType Tinput, Tfilter; + bool type_attrs_present = false; - if ((GetNodeAttr(n->def(), "Tinput", &Tinput).ok() && - GetNodeAttr(n->def(), "Tfilter", &Tfilter).ok()) || - (GetNodeAttr(n->def(), "T1", &T1).ok() && - GetNodeAttr(n->def(), "T2", &T2).ok())) { - if (mkl_op_registry::IsMklLayoutDependentOp( - mkl_op_registry::GetMklOpName(n->type_string()), T1, T2) || - mkl_op_registry::IsMklLayoutDependentOp( - mkl_op_registry::GetMklOpName(n->type_string()), Tinput, Tfilter)) { - for (auto ri = rinfo_.cbegin(); ri != rinfo_.cend(); ++ri) { - if (n->type_string().compare(ri->name) == 0 && ri->rewrite_rule(n)) { - return &*ri; - } + if (GetNodeAttr(n->def(), "Tinput", &Tinput).ok() && + GetNodeAttr(n->def(), "Tfilter", &Tfilter).ok() && + mkl_op_registry::IsMklLayoutDependentOp( + mkl_op_registry::GetMklOpName(n->type_string()), Tinput, Tfilter)) { + type_attrs_present = true; + } else if (GetNodeAttr(n->def(), "T1", &T1).ok() && + GetNodeAttr(n->def(), "T2", &T2).ok() && + mkl_op_registry::IsMklLayoutDependentOp( + mkl_op_registry::GetMklOpName(n->type_string()), T1, T2)) { + type_attrs_present = true; + } + + if (type_attrs_present) { + for (auto ri = rinfo_.cbegin(); ri != rinfo_.cend(); ++ri) { + if (n->type_string().compare(ri->name) == 0 && ri->rewrite_rule(n)) { + return &*ri; } } } @@ -3962,7 +3963,8 @@ Status MklLayoutRewritePass::FuseTransposeMklOpTranspose( // Create node. Node* new_node; TF_CHECK_OK(nb.Finalize(&**g, &new_node)); - DCHECK(new_node); + // No need to check if new_node is null because it will be null only when + // Finalize fails. // Fill outputs. 
for (const Edge* e : transpose_to_nchw->out_edges()) { From 54eb1054a1a4881e5b2b66e095b4299bcbc659e3 Mon Sep 17 00:00:00 2001 From: Matt Conley Date: Mon, 15 Jul 2019 22:58:27 -0700 Subject: [PATCH 0099/3053] Implement GetStats function for cuda malloc allocator --- .../core/common_runtime/gpu/gpu_cudamalloc_allocator.cc | 4 ++++ tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h | 1 + 2 files changed, 5 insertions(+) diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc index ea12a663b2f..491ef2ad8d2 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.cc @@ -61,6 +61,10 @@ void GPUcudaMallocAllocator::DeallocateRaw(void* ptr) { #endif // GOOGLE_CUDA } +absl::optional GPUcudaMallocAllocator::GetStats() { + return base_allocator_->GetStats(); +} + bool GPUcudaMallocAllocator::TracksAllocationSizes() const { return false; } } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h index 5025eed1213..b45d505c017 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h +++ b/tensorflow/core/common_runtime/gpu/gpu_cudamalloc_allocator.h @@ -38,6 +38,7 @@ class GPUcudaMallocAllocator : public Allocator { void* AllocateRaw(size_t alignment, size_t num_bytes) override; void DeallocateRaw(void* ptr) override; bool TracksAllocationSizes() const override; + absl::optional GetStats() override; private: Allocator* base_allocator_ = nullptr; // owned From 19e931943c895830c92a40a92a66b376da6afb81 Mon Sep 17 00:00:00 2001 From: captain-pool Date: Tue, 16 Jul 2019 17:05:10 +0530 Subject: [PATCH 0100/3053] Added show function description feature for SavedModel 2.0 --- tensorflow/python/tools/saved_model_cli.py | 155 +++++++++++++++++---- 1 file changed, 126 insertions(+), 29 deletions(-) diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py index cdef42e2bf8..2d1b44e9034 100644 --- a/tensorflow/python/tools/saved_model_cli.py +++ b/tensorflow/python/tools/saved_model_cli.py @@ -36,12 +36,17 @@ from tensorflow.core.example import example_pb2 from tensorflow.core.framework import types_pb2 from tensorflow.python.client import session from tensorflow.python.debug.wrappers import local_cli_wrapper +from tensorflow.python.eager import context from tensorflow.python.framework import meta_graph as meta_graph_lib from tensorflow.python.framework import ops as ops_lib +from tensorflow.python.framework import tensor_spec from tensorflow.python.lib.io import file_io from tensorflow.python.platform import app # pylint: disable=unused-import +from tensorflow.python.saved_model import load from tensorflow.python.saved_model import loader +from tensorflow.python.saved_model import save from tensorflow.python.tools import saved_model_utils +from tensorflow.python.util import nest # Set of ops to blacklist. _OP_BLACKLIST = set(['WriteFile', 'ReadFile', 'PrintV2']) @@ -116,7 +121,11 @@ def _get_outputs_tensor_info_from_meta_graph_def(meta_graph_def, return meta_graph_def.signature_def[signature_def_key].outputs -def _show_inputs_outputs(saved_model_dir, tag_set, signature_def_key, indent=0): +def _show_inputs_outputs( + saved_model_dir, + tag_set, + signature_def_key, + indent=0): """Prints input and output TensorInfos. 
Prints the details of input and output TensorInfos for the SignatureDef mapped @@ -137,24 +146,96 @@ def _show_inputs_outputs(saved_model_dir, tag_set, signature_def_key, indent=0): meta_graph_def, signature_def_key) indent_str = ' ' * indent + def in_print(s): print(indent_str + s) in_print('The given SavedModel SignatureDef contains the following input(s):') for input_key, input_tensor in sorted(inputs_tensor_info.items()): in_print(' inputs[\'%s\'] tensor_info:' % input_key) - _print_tensor_info(input_tensor, indent+1) + _print_tensor_info(input_tensor, indent + 1) in_print('The given SavedModel SignatureDef contains the following ' 'output(s):') for output_key, output_tensor in sorted(outputs_tensor_info.items()): in_print(' outputs[\'%s\'] tensor_info:' % output_key) - _print_tensor_info(output_tensor, indent+1) + _print_tensor_info(output_tensor, indent + 1) in_print('Method name is: %s' % meta_graph_def.signature_def[signature_def_key].method_name) +def _show_defined_functions(saved_model_dir, indent=0): + if context.executing_eagerly(): + ops_lib.disable_eager_execution() + trackable_object = load.load(saved_model_dir) + indent_str = ' ' * indent + + def in_print(s): + print(indent_str + s) + print('Defined Functions:') + functions = save._AugmentedGraphView( + trackable_object).list_functions(trackable_object) + for name, function in functions.items(): + for concrete_functions in function._list_all_concrete_functions_for_serialization(): + args, kwargs = (concrete_functions.structured_input_signature) + in_print('Function Name: \'%s\'' % name) + in_print('Callable with:') + _print_args(args, indent=2) + + +def _print_args(arguments, indent=0): # Level is indent + indent_str = ' ' * indent + + def quotes(value): + is_quotes = '\'' * isinstance(value, str) + return is_quotes + value + is_quotes + + def in_print(s, end='\n'): + print(indent_str + s, end=end) + + def is_nested(args): + return nest.is_nested(args) and not isinstance(args, dict) + if is_nested(arguments): + for index, element in enumerate(arguments, 1): + if indent == 2: + in_print('Argument #%d' % index) + if isinstance(element, tensor_spec.TensorSpec): + _print_tensor_spec(element, indent) + elif is_nested(element): + in_print(' DType: %s' % type(element).__name__) + in_print(' Values: [', end='') + _print_args(element, indent + 1) + in_print(' ]') + elif isinstance(element, dict): + in_print(' DType: %s' % type(element).__name__) + in_print(' Values: {', end='') + for key, value in element.items(): + if is_nested(element): + in_print(' \'%s\': [' % str(key), end='') + _print_args(element, indent + 1) + in_print(' ]') + else: + in_print(' \'%s\': %s' % (str(key), quotes(value)), end='') + in_print(' }') + else: + in_print(' DType: %s' % type(element).__name__) + in_print(' Value: %s' % str(element)) + + +def _print_tensor_spec(tensor_spec, indent=0): + indent_str = ' ' * indent + + def in_print(s): + print(indent_str + s) + in_print( + ' %s: Tensor(shape=%s, dtype=%s, name=\'%s\')' % + (tensor_spec.name, + tensor_spec.shape, + tensor_spec.dtype.name, + tensor_spec.name)) + + def _print_tensor_info(tensor_info, indent=0): """Prints details of the given tensor_info. 
@@ -163,6 +244,7 @@ def _print_tensor_info(tensor_info, indent=0): indent: How far (in increments of 2 spaces) to indent each line output """ indent_str = ' ' * indent + def in_print(s): print(indent_str + s) @@ -200,6 +282,7 @@ def _show_all(saved_model_dir): print('\nsignature_def[\'' + signature_def_key + '\']:') _show_inputs_outputs(saved_model_dir, tag_set, signature_def_key, indent=1) + _show_defined_functions(saved_model_dir, indent=1) def get_meta_graph_def(saved_model_dir, tag_set): @@ -433,8 +516,10 @@ def preprocess_input_exprs_arg_string(input_exprs_str): for input_raw in filter(bool, input_exprs_str.split(';')): if '=' not in input_exprs_str: - raise RuntimeError('--input_exprs "%s" format is incorrect. Please follow' - '"="' % input_exprs_str) + raise RuntimeError( + '--input_exprs "%s" format is incorrect. Please follow' + '"="' % + input_exprs_str) input_key, expr = input_raw.split('=', 1) # ast.literal_eval does not work with numpy expressions input_dict[input_key] = eval(expr) # pylint: disable=eval-used @@ -586,7 +671,8 @@ def load_inputs_from_input_arg_string(inputs_str, input_exprs_str, if input_tensor_key in tensor_key_feed_dict: warnings.warn( 'input_key %s has been specified with both --inputs and --input_exprs' - ' options. Value in --input_exprs will be used.' % input_tensor_key) + ' options. Value in --input_exprs will be used.' % + input_tensor_key) tensor_key_feed_dict[input_tensor_key] = py_expr_evaluated # When input is a tf.Example: @@ -637,10 +723,16 @@ def run(args): 'required') tensor_key_feed_dict = load_inputs_from_input_arg_string( args.inputs, args.input_exprs, args.input_examples) - run_saved_model_with_feed_dict(args.dir, args.tag_set, args.signature_def, - tensor_key_feed_dict, args.outdir, - args.overwrite, worker=args.worker, - init_tpu=args.init_tpu, tf_debug=args.tf_debug) + run_saved_model_with_feed_dict( + args.dir, + args.tag_set, + args.signature_def, + tensor_key_feed_dict, + args.outdir, + args.overwrite, + worker=args.worker, + init_tpu=args.init_tpu, + tf_debug=args.tf_debug) def scan(args): @@ -738,21 +830,24 @@ def create_parser(): parser_show.set_defaults(func=show) # run command - run_msg = ('Usage example:\n' - 'To run input tensors from files through a MetaGraphDef and save' - ' the output tensors to files:\n' - '$saved_model_cli show --dir /tmp/saved_model --tag_set serve \\\n' - ' --signature_def serving_default \\\n' - ' --inputs input1_key=/tmp/124.npz[x],input2_key=/tmp/123.npy ' - '\\\n' - ' --input_exprs \'input3_key=np.ones(2)\' \\\n' - ' --input_examples ' - '\'input4_key=[{"id":[26],"weights":[0.5, 0.5]}]\' \\\n' - ' --outdir=/out\n\n' - 'For more information about input file format, please see:\n' - 'https://www.tensorflow.org/guide/saved_model_cli\n') + run_msg = ( + 'Usage example:\n' + 'To run input tensors from files through a MetaGraphDef and save' + ' the output tensors to files:\n' + '$saved_model_cli show --dir /tmp/saved_model --tag_set serve \\\n' + ' --signature_def serving_default \\\n' + ' --inputs input1_key=/tmp/124.npz[x],input2_key=/tmp/123.npy ' + '\\\n' + ' --input_exprs \'input3_key=np.ones(2)\' \\\n' + ' --input_examples ' + '\'input4_key=[{"id":[26],"weights":[0.5, 0.5]}]\' \\\n' + ' --outdir=/out\n\n' + 'For more information about input file format, please see:\n' + 'https://www.tensorflow.org/guide/saved_model_cli\n') parser_run = subparsers.add_parser( - 'run', description=run_msg, formatter_class=argparse.RawTextHelpFormatter) + 'run', + description=run_msg, + 
formatter_class=argparse.RawTextHelpFormatter) parser_run.add_argument( '--dir', type=str, @@ -769,9 +864,10 @@ def create_parser(): required=True, metavar='SIGNATURE_DEF_KEY', help='key of SignatureDef to run') - msg = ('Loading inputs from files, in the format of \'=,' - ' or \'=[]\', separated by \';\'.' - ' The file format can only be from .npy, .npz or pickle.') + msg = ( + 'Loading inputs from files, in the format of \'=,' + ' or \'=[]\', separated by \';\'.' + ' The file format can only be from .npy, .npz or pickle.') parser_run.add_argument('--inputs', type=str, default='', help=msg) msg = ('Specifying inputs by python expressions, in the format of' ' "=\'\'", separated by \';\'. ' @@ -888,8 +984,9 @@ def create_parser(): '--minimum_segment_size', type=int, default=3, - help=('the minimum number of nodes required for a subgraph to be replaced' - 'in a TensorRT node')) + help=( + 'the minimum number of nodes required for a subgraph to be replaced' + 'in a TensorRT node')) parser_convert_with_tensorrt.add_argument( '--is_dynamic_op', type=bool, From 77fb8f9dd2cb730ded8983ebb7363c3c77f7834c Mon Sep 17 00:00:00 2001 From: captain-pool Date: Tue, 16 Jul 2019 20:17:48 +0530 Subject: [PATCH 0101/3053] Minor fixes --- tensorflow/python/tools/saved_model_cli.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py index 2d1b44e9034..6469464c45c 100644 --- a/tensorflow/python/tools/saved_model_cli.py +++ b/tensorflow/python/tools/saved_model_cli.py @@ -177,11 +177,13 @@ def _show_defined_functions(saved_model_dir, indent=0): functions = save._AugmentedGraphView( trackable_object).list_functions(trackable_object) for name, function in functions.items(): - for concrete_functions in function._list_all_concrete_functions_for_serialization(): + in_print('Function Name: \'%s\'' % name) + for index, concrete_functions in enumerate( + function._list_all_concrete_functions_for_serialization(), 1): args, kwargs = (concrete_functions.structured_input_signature) - in_print('Function Name: \'%s\'' % name) - in_print('Callable with:') - _print_args(args, indent=2) + in_print('Option #%d' % index) + in_print(' Callable with:') + _print_args(args, indent=3) def _print_args(arguments, indent=0): # Level is indent @@ -198,7 +200,7 @@ def _print_args(arguments, indent=0): # Level is indent return nest.is_nested(args) and not isinstance(args, dict) if is_nested(arguments): for index, element in enumerate(arguments, 1): - if indent == 2: + if indent == 3: in_print('Argument #%d' % index) if isinstance(element, tensor_spec.TensorSpec): _print_tensor_spec(element, indent) From cf52fed10c6e0bb0f25e148f46e4f42470ba7ab0 Mon Sep 17 00:00:00 2001 From: captain-pool Date: Tue, 16 Jul 2019 20:38:30 +0530 Subject: [PATCH 0102/3053] cleaned up codes --- tensorflow/python/tools/saved_model_cli.py | 27 ++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py index 6469464c45c..62de9946de2 100644 --- a/tensorflow/python/tools/saved_model_cli.py +++ b/tensorflow/python/tools/saved_model_cli.py @@ -166,13 +166,21 @@ def _show_inputs_outputs( def _show_defined_functions(saved_model_dir, indent=0): + """Prints the function definition of SavedModel2.0 located at saved_model_dir + + Args: + saved_model_dir: Directory containing the SavedModel to inspect. 
+ indent: How far (in increments of 2 spaces) to indent each line of output. + """ if context.executing_eagerly(): + # Disable eager execution to prevent loading of checkpoints ops_lib.disable_eager_execution() trackable_object = load.load(saved_model_dir) indent_str = ' ' * indent def in_print(s): print(indent_str + s) + print('Defined Functions:') functions = save._AugmentedGraphView( trackable_object).list_functions(trackable_object) @@ -184,9 +192,18 @@ def _show_defined_functions(saved_model_dir, indent=0): in_print('Option #%d' % index) in_print(' Callable with:') _print_args(args, indent=3) + if kwargs: + _print_args(args, "Named Argument", indent=3) -def _print_args(arguments, indent=0): # Level is indent +def _print_args(arguments, argument_type="Argument", indent=0): + """Formats and prints the argument of the concrete functions defined in the model + + Args: + arguments: Arguments of the concrete functions. + argument_type: Type of Argument List to Format and print. + indent: How far (in increments of 2 spaces) to indent each line of output. + """ indent_str = ' ' * indent def quotes(value): @@ -201,7 +218,7 @@ def _print_args(arguments, indent=0): # Level is indent if is_nested(arguments): for index, element in enumerate(arguments, 1): if indent == 3: - in_print('Argument #%d' % index) + in_print('%s #%d' % (argument_type, index)) if isinstance(element, tensor_spec.TensorSpec): _print_tensor_spec(element, indent) elif is_nested(element): @@ -226,6 +243,12 @@ def _print_args(arguments, indent=0): # Level is indent def _print_tensor_spec(tensor_spec, indent=0): + """Prints details of the given tensor_spec. + + Args: + tensor_spec: TensorSpec object to be printed. + indent: How far (in increments of 2 spaces) to indent each line output + """ indent_str = ' ' * indent def in_print(s): From cc93b7d7d5a066c5dbc597a28760cbd0cc2eb73c Mon Sep 17 00:00:00 2001 From: "Wen-Heng (Jack) Chung" Date: Tue, 16 Jul 2019 10:25:27 -0500 Subject: [PATCH 0103/3053] Remove unused DISABLED_ON_CPU macro from tests. --- tensorflow/compiler/xla/tests/convolution_test.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc index b58d28ae582..4e7f9dd3c4d 100644 --- a/tensorflow/compiler/xla/tests/convolution_test.cc +++ b/tensorflow/compiler/xla/tests/convolution_test.cc @@ -1950,7 +1950,7 @@ class ConvolutionHloTest : public HloTestBase {}; // double datatype is not yet supported in ROCm XLA_TEST_F(ConvolutionHloTest, - DISABLED_ON_GPU_ROCM(DISABLED_ON_CPU(ConvolveF64Forward))) { + DISABLED_ON_GPU_ROCM(ConvolveF64Forward)) { constexpr char kHlo[] = R"( HloModule TestModule @@ -1976,7 +1976,7 @@ ENTRY Test { // double datatype is not yet supported in ROCm XLA_TEST_F(ConvolutionHloTest, - DISABLED_ON_GPU_ROCM(DISABLED_ON_CPU(ConvolveF64BackwardFilter))) { + DISABLED_ON_GPU_ROCM(ConvolveF64BackwardFilter)) { constexpr char kHlo[] = R"( HloModule TestModule @@ -1990,7 +1990,7 @@ ENTRY Test { // double datatype is not yet supported in ROCm XLA_TEST_F(ConvolutionHloTest, - DISABLED_ON_GPU_ROCM(DISABLED_ON_CPU(ConvolveF64BackwardInput))) { + DISABLED_ON_GPU_ROCM(ConvolveF64BackwardInput)) { constexpr char kHlo[] = R"( HloModule TestModule From aaa18c5bdbf55f0288289e58073d8260e764eb92 Mon Sep 17 00:00:00 2001 From: "Wen-Heng (Jack) Chung" Date: Tue, 16 Jul 2019 10:33:59 -0500 Subject: [PATCH 0104/3053] Enable 3D convolution tests on ROCm as it's now supported. 
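The saved_model_cli patches above enumerate each function exported by a SavedModel and print, per concrete function, what it can be called with. For intuition only, here is a minimal sketch of the same inspection done through the public `tf.saved_model.load` API instead of the CLI internals; the model directory is a placeholder, and this walks only the exported signatures rather than every concrete function the CLI lists.

```
import tensorflow as tf

# Placeholder path; any TF2 SavedModel directory would work here.
saved_model_dir = '/tmp/saved_model'
loaded = tf.saved_model.load(saved_model_dir)

# Each exported signature is a ConcreteFunction whose structured_input_signature
# is an (args, kwargs) pair of TensorSpecs, which is what the CLI prints per option.
for name, concrete_fn in loaded.signatures.items():
  args, kwargs = concrete_fn.structured_input_signature
  print('Function Name: %r' % name)
  print('  Callable with: %s' % (args,))
  if kwargs:
    print('  Named arguments: %s' % (kwargs,))
```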
--- tensorflow/compiler/xla/tests/convolution_test.cc | 4 +--- .../compiler/xla/tests/convolution_variants_test.cc | 8 ++------ 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc index 4e7f9dd3c4d..9e7b627a64d 100644 --- a/tensorflow/compiler/xla/tests/convolution_test.cc +++ b/tensorflow/compiler/xla/tests/convolution_test.cc @@ -408,9 +408,7 @@ class Convolve1D_1x2x5_1x2x2_WithPadding : public ConvolutionTest { TYPED_TEST_CASE(Convolve1D_1x2x5_1x2x2_WithPadding, TestTypes); TYPED_TEST(Convolve1D_1x2x5_1x2x2_WithPadding, Types) { this->RunTest(); } -// 5D tensors are not yet supported in ROCm -XLA_TEST_F(ConvolutionTest, - DISABLED_ON_GPU_ROCM(Convolve3D_1x4x2x3x3_2x2x2x3x3_Valid)) { +XLA_TEST_F(ConvolutionTest, Convolve3D_1x4x2x3x3_2x2x2x3x3_Valid) { XlaBuilder builder(TestName()); std::vector input_dims = {1, 4, 2, 3, 3}; std::vector filter_dims = {2, 2, 2, 3, 3}; diff --git a/tensorflow/compiler/xla/tests/convolution_variants_test.cc b/tensorflow/compiler/xla/tests/convolution_variants_test.cc index ff5503b08e9..ba3e9c436e3 100644 --- a/tensorflow/compiler/xla/tests/convolution_variants_test.cc +++ b/tensorflow/compiler/xla/tests/convolution_variants_test.cc @@ -1330,9 +1330,7 @@ XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding1D) { ComputeAndCompareR3(&builder, {{{13, 24, 130}}}, {}, error_spec_); } -// 5D tensors are not yet supported in ROCm -XLA_TEST_F(ConvolutionVariantsTest, - DISABLED_ON_GPU_ROCM(BackwardInputEvenPadding3D)) { +XLA_TEST_F(ConvolutionVariantsTest, BackwardInputEvenPadding3D) { XlaBuilder builder(TestName()); auto gradients_flat = LiteralUtil::CreateR1({1}); @@ -1356,9 +1354,7 @@ XLA_TEST_F(ConvolutionVariantsTest, ComputeAndCompareLiteral(&builder, expected_literal, {}, error_spec_); } -// 5D tensors are not yet supported in ROCm -XLA_TEST_F(ConvolutionVariantsTest, - DISABLED_ON_GPU_ROCM(BackwardFilterEvenPadding3D)) { +XLA_TEST_F(ConvolutionVariantsTest, BackwardFilterEvenPadding3D) { XlaBuilder builder(TestName()); auto activations_flat = LiteralUtil::CreateR1({1, 2, 3, 4}); From 27f8281d722c3b638b60e5aeeb80a129a3734463 Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Tue, 16 Jul 2019 11:08:37 -0700 Subject: [PATCH 0105/3053] Mild cleanup. 
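The convolution tests re-enabled above exercise 3-D convolutions such as Convolve3D_1x4x2x3x3_2x2x2x3x3_Valid. As a rough framework-level illustration only (not a translation of the HLO test, whose dimension numbering is configured separately), a plain `tf.nn.conv3d` call with similarly sized operands and VALID padding looks like this:

```
import numpy as np
import tensorflow as tf

# Shapes chosen to echo the 1x4x2x3x3 input and 2x2x2x3x3 filter in the test.
# tf.nn.conv3d expects NDHWC inputs and DHWIO filters, so treat this as an
# illustration of a VALID 3-D convolution, not a re-creation of the XLA test.
x = tf.constant(np.random.rand(1, 4, 2, 3, 3).astype(np.float32))
w = tf.constant(np.random.rand(2, 2, 2, 3, 3).astype(np.float32))
y = tf.nn.conv3d(x, w, strides=[1, 1, 1, 1, 1], padding='VALID')
print(y.shape)  # (1, 3, 1, 2, 3): each spatial dimension shrinks by filter - 1
```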
--- tensorflow/compiler/tf2tensorrt/BUILD | 1 - tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD index bca101c4a53..7490f4e8d15 100644 --- a/tensorflow/compiler/tf2tensorrt/BUILD +++ b/tensorflow/compiler/tf2tensorrt/BUILD @@ -253,7 +253,6 @@ tf_cuda_library( ":utils", "//tensorflow/core:framework_headers_lib", "//tensorflow/core:framework_lite", - #"//tensorflow/core:framework", "//tensorflow/core/grappler:op_types", "//tensorflow/core:graph", "//tensorflow/core:gpu_runtime", diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index 112966acb40..6dbd210316b 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -441,7 +441,6 @@ Status CreateTRTNode(const ConversionParams& params, segment_string = string(static_cast(engine_data->data()), engine_data->size()); } else { - //segment_string = info.segment_graph_def.SerializeAsString(); segment_string = ""; } @@ -540,7 +539,8 @@ Status CreateTRTNode(const ConversionParams& params, Status ModifyGraphForFunctionDef(Graph* graph, const GraphDef& segment, Graph* sgraph) { - //Graph sgraph(graph->flib_def()); + // sgraph is a graph for the segment, to be modified by this function + // graph is the input graph to be optimized by TRT. GraphConstructorOptions gcopts; TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(gcopts, segment, sgraph)); std::map io_nodes; From 0404f60b100a77059c5164d6da9953b6c18cb8f4 Mon Sep 17 00:00:00 2001 From: amoitra Date: Tue, 16 Jul 2019 13:31:45 -0700 Subject: [PATCH 0106/3053] Add check for depthwise fwd conv addressing test failures and reverting change for MatchBackwardInput --- .../xla/service/gpu/cudnn_conv_rewriter.cc | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc index ca8d63cbcc7..9e59b1290ed 100755 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc @@ -153,6 +153,15 @@ MatchBackwardFilter(HloInstruction* conv) { "to fold it to a backward filter convolution."; return no_match_result; } + auto rhs_in = + conv->mutable_operand(1)->shape().dimensions(kernel_input_feature_dim); + if ((conv->feature_group_count() > 1) && (rhs_in == 1) && + (input_batch_dim == output_batch_dim)) { + VLOG(1) << conv->ToString() + << " is a depthwise forward convolution. No need to fold to " + "backward filter."; + return no_match_result; + } // Step 3: fuse the matched HLOs into a backward convolution instruction. // @@ -279,6 +288,15 @@ MatchBackwardInput(HloInstruction* conv) { const auto no_match_result = std::make_tuple(false, Window(), ConvolutionDimensionNumbers(), nullptr); + // TODO(b/119479517): Theoretically cuDNN supports grouped convolutions also + // for the backward input convolution, but at least for now with version 7.1.4 + // it is slower. This needs to be re-evaluated for future cuDNN versions. + // Note that we already have the necessary code down below, the only thing to + // enable it is to remove the following early return. + if (conv->feature_group_count() > 1) { + return no_match_result; + } + // Match instruction pattern. 
CHECK_EQ(HloOpcode::kConvolution, conv->opcode()); HloInstruction* reverse_filter = conv->mutable_operand(1); From 5f44f3fd957409e2ea46f8db8e846be625cdbbfa Mon Sep 17 00:00:00 2001 From: amoitra Date: Tue, 16 Jul 2019 14:14:49 -0700 Subject: [PATCH 0107/3053] Enable Use of Cudnn APIs for Backward Input Grouped Convolutions --- .../compiler/xla/service/gpu/cudnn_conv_rewriter.cc | 9 --------- 1 file changed, 9 deletions(-) mode change 100644 => 100755 tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc old mode 100644 new mode 100755 index e81850db69e..4ab82d1f463 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc @@ -258,15 +258,6 @@ MatchBackwardInput(HloInstruction* conv) { const auto no_match_result = std::make_tuple(false, Window(), ConvolutionDimensionNumbers(), nullptr); - // TODO(b/119479517): Theoretically cuDNN supports grouped convolutions also - // for the backward input convolution, but at least for now with version 7.1.4 - // it is slower. This needs to be re-evaluated for future cuDNN versions. - // Note that we already have the necessary code down below, the only thing to - // enable it is to remove the following early return. - if (conv->feature_group_count() > 1) { - return no_match_result; - } - // Match instruction pattern. CHECK_EQ(HloOpcode::kConvolution, conv->opcode()); HloInstruction* reverse_filter = conv->mutable_operand(1); From 1679c2ab5d9ef4493f79b9bdbbe70bb08e2004ce Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Tue, 16 Jul 2019 14:54:06 -0700 Subject: [PATCH 0108/3053] More mild cleanup, removed unnecessary static condition.y --- tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc | 3 +-- tensorflow/compiler/tf2tensorrt/convert/convert_graph.h | 4 ---- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index 6dbd210316b..a1234b56e0a 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -783,8 +783,7 @@ Status ConvertAfterShapes(const ConversionParams& params) { engine_segments.push_back(std::move(curr_engine)); converted_segments.push_back(std::move(curr_segment)); - if (VLOG_IS_ON(8) && - curr_engine.engine_type == EngineInfo::EngineType::TRTStatic) { + if (VLOG_IS_ON(8)) { string fname = engine_segments.back().engine_name; StrAppend(&fname, ".pb"); std::fstream f; diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h index f7674fb367c..25bcb345ce5 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h @@ -56,10 +56,6 @@ Status ConvertAfterShapes(const ConversionParams& params); std::pair GetDeviceAndAllocator(const ConversionParams& params, const EngineInfo& engine); -/*Status RegisterSegmentFunctionToFunctionLibrary(Graph* graph, - const GraphDef& segment, - const string& engine_name); - */ Status ModifyGraphForFunctionDef(Graph* graph, const GraphDef& segment, Graph* sgraph); From 2b681a72d1e785b3d3cbdc9f3f4fded627665f40 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 16 Jul 2019 23:59:59 +0000 Subject: [PATCH 0109/3053] Fix incorrect default values of tf.sparse.to_dense 
This fix tries to address the issue where tf.sparse.to_dense without specifying default value explicitly leads to TypeError: ``` import tensorflow as tf sample_string = tf.sparse.SparseTensor(indices=[[0, 0], [1, 2]], values=['a', 'b'], dense_shape=[3, 4]) tf.sparse.to_dense( sample_string ) ... TypeError: Expected string passed to parameter 'default_value' of op 'SparseToDense', got 0 of type 'int' instead. Error: Expected string, got 0 of type 'int' instead. ``` The issue was that tf.sparse.to_dense use 0 as the default value which does not work well with string. This fix changes from `default_value=0` -> `default_value=None` and use zeros instead. It consists of an API change though the change is backward compatible. This fix fixes 30750 Signed-off-by: Yong Tang --- tensorflow/python/ops/sparse_ops.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py index f6b26c80a10..043857f71b4 100644 --- a/tensorflow/python/ops/sparse_ops.py +++ b/tensorflow/python/ops/sparse_ops.py @@ -1430,7 +1430,7 @@ def sparse_reduce_sum_sparse(sp_input, @tf_export("sparse.to_dense", v1=["sparse.to_dense", "sparse_tensor_to_dense"]) @deprecation.deprecated_endpoints("sparse_tensor_to_dense") def sparse_tensor_to_dense(sp_input, - default_value=0, + default_value=None, validate_indices=True, name=None): """Converts a `SparseTensor` into a dense tensor. @@ -1470,6 +1470,8 @@ def sparse_tensor_to_dense(sp_input, TypeError: If `sp_input` is not a `SparseTensor`. """ sp_input = _convert_to_sparse_tensor(sp_input) + if default_value is None: + default_value = array_ops.zeros([], dtype=sp_input.dtype) return gen_sparse_ops.sparse_to_dense( sp_input.indices, From 6f0b851fc9dd160b92284b206536c1ae12c504b3 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 17 Jul 2019 00:03:43 +0000 Subject: [PATCH 0110/3053] Add test case for GitHub issue 30750 Signed-off-by: Yong Tang --- tensorflow/python/ops/sparse_ops_test.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tensorflow/python/ops/sparse_ops_test.py b/tensorflow/python/ops/sparse_ops_test.py index 992a330a959..c78aae3cfd0 100644 --- a/tensorflow/python/ops/sparse_ops_test.py +++ b/tensorflow/python/ops/sparse_ops_test.py @@ -125,6 +125,16 @@ class SparseOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase): epsilon = 1e-4 self.assertLess(gradient_checker.max_error(*grads), epsilon) + def testSparseTensorToDenseString(self): + sp = sparse_tensor.SparseTensor( + indices=[[0, 0], [1, 2]], + values=['a', 'b'], + dense_shape=[2, 3]) + dense = sparse_ops.sparse_tensor_to_dense(sp) + expected_dense = [['a', '', ''], ['', '', 'b']] + result_dense = self.evaluate(dense) + self.assertAllEqual(expected_dense, result_dense) + if __name__ == '__main__': googletest.main() From b29e92bd298d1dd1740f860671f8a0906f63b476 Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Tue, 16 Jul 2019 20:06:04 -0700 Subject: [PATCH 0111/3053] Moved constant IO strings into class. Renamed method in funcdef_to_graphdef. Formatted, removed commenting. 
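The sparse-ops change a few patches above makes `tf.sparse.to_dense` fall back to the dtype's zero value when `default_value` is not given, so string SparseTensors densify to empty strings instead of raising the TypeError quoted in that commit message. A small usage sketch of the intended post-fix behaviour, run eagerly:

```
import tensorflow as tf

st = tf.sparse.SparseTensor(indices=[[0, 0], [1, 2]],
                            values=['a', 'b'],
                            dense_shape=[2, 3])

# With the fix, the implicit default is the dtype's zero value; for strings
# that is the empty string. An explicit default_value still takes precedence.
print(tf.sparse.to_dense(st).numpy())
# [[b'a' b'' b'']
#  [b'' b'' b'b']]
print(tf.sparse.to_dense(st, default_value='-').numpy())
# [[b'a' b'-' b'-']
#  [b'-' b'-' b'b']]
```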
--- .../tf2tensorrt/convert/convert_graph.cc | 43 +++--- .../tf2tensorrt/convert/convert_graph.h | 7 +- .../tf2tensorrt/convert/convert_nodes.cc | 132 +++++++++--------- .../tf2tensorrt/convert/convert_nodes.h | 4 +- .../compiler/tf2tensorrt/convert/utils.h | 8 ++ .../tf2tensorrt/kernels/trt_engine_op.cc | 54 ++++--- .../tf2tensorrt/kernels/trt_engine_op_test.cc | 19 +-- .../tf2tensorrt/utils/funcdef_to_graphdef.cc | 74 ++++------ .../tf2tensorrt/utils/funcdef_to_graphdef.h | 13 +- .../test/tf_trt_integration_test_base.py | 2 - .../compiler/tensorrt/trt_convert_test.py | 2 - 11 files changed, 159 insertions(+), 199 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index a1234b56e0a..74d4da6df73 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -49,9 +49,9 @@ limitations under the License. #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" -#include "tensorflow/core/protobuf/config.pb.h" // NOLINT +#include "tensorflow/core/protobuf/config.pb.h" // NOLINT #include "tensorflow/core/protobuf/device_properties.pb.h" // NOLINT -#include "tensorflow/core/protobuf/rewriter_config.pb.h" // NOLINT +#include "tensorflow/core/protobuf/rewriter_config.pb.h" // NOLINT #include "tensorflow/core/util/device_name_utils.h" #if GOOGLE_CUDA @@ -66,6 +66,8 @@ using absl::StrCat; namespace { +//auto prefixes = IONamePrefixes(); + Status BuildNodeMap(const Graph& graph, std::unordered_map* node_map) { for (auto* node : graph.op_nodes()) { @@ -466,7 +468,8 @@ Status CreateTRTNode(const ConversionParams& params, .Attr("output_shapes", output_shape_protos) .Attr("static_engine", info.engine_type == EngineInfo::EngineType::TRTStatic) - .Attr("segment_funcdef_name", StrCat(info.engine_name, "_native_segment")) + .Attr("segment_funcdef_name", + StrCat(info.engine_name, "_native_segment")) .Attr("serialized_segment", segment_string) .Attr("calibration_data", "") .Attr("max_cached_engines_count", info.maximum_cached_engines) @@ -536,8 +539,7 @@ Status CreateTRTNode(const ConversionParams& params, } // Function to construct a funcdef from the segment and add it to the graph. -Status ModifyGraphForFunctionDef(Graph* graph, - const GraphDef& segment, +Status ModifyGraphForFunctionDef(Graph* graph, const GraphDef& segment, Graph* sgraph) { // sgraph is a graph for the segment, to be modified by this function // graph is the input graph to be optimized by TRT. 
@@ -546,16 +548,16 @@ Status ModifyGraphForFunctionDef(Graph* graph, std::map io_nodes; int num_inputs = 0; for (auto n : sgraph->op_nodes()) { - if (absl::StartsWith(n->name(), kInputPHName)) { + if (absl::StartsWith(n->name(), prefixes.kInputPHName)) { num_inputs++; io_nodes.insert({n->name(), n}); - } else if (absl::StartsWith(n->name(), kOutputPHName)) { + } else if (absl::StartsWith(n->name(), prefixes.kOutputPHName)) { io_nodes.insert({n->name(), n}); } } for (int i = 0; i < num_inputs; ++i) { - auto name = StrCat(kInputPHName, i); + auto name = StrCat(prefixes.kInputPHName, i); auto node = io_nodes[name]; NodeDef nd; NodeDefBuilder node_builder(StrCat(name, "_Arg"), @@ -582,7 +584,7 @@ Status ModifyGraphForFunctionDef(Graph* graph, } for (int i = 0; i < io_nodes.size() - num_inputs; ++i) { - auto name = StrCat(kOutputPHName, i); + auto name = StrCat(prefixes.kOutputPHName, i); auto node = io_nodes[name]; NodeDef nd; NodeDefBuilder node_builder(StrCat(name, "_Ret"), @@ -694,7 +696,8 @@ std::pair GetDeviceAndAllocator(const ConversionParams& params, // Entry function from optimization pass. Status ConvertAfterShapes(const ConversionParams& params) { // Sanity checks. - if (params.precision_mode != TrtPrecisionMode::INT8 && params.use_calibration) { + if (params.precision_mode != TrtPrecisionMode::INT8 && + params.use_calibration) { return errors::InvalidArgument( "Calibration requires enabling fallback to TF function execution."); } @@ -717,9 +720,8 @@ Status ConvertAfterShapes(const ConversionParams& params) { TrtNodeValidator validator(*params.graph_properties, params.precision_mode, params.use_calibration); TF_RETURN_IF_ERROR(segment::SegmentGraph( - &graph, - std::bind(&TrtNodeValidator::IsTensorRTCandidate, &validator, - std::placeholders::_1), + &graph, std::bind(&TrtNodeValidator::IsTensorRTCandidate, &validator, + std::placeholders::_1), // Input validation is already done by TrtNodeValidator, so we don't // need to check the input edges. [](const Edge* edge) { return true; }, OutputEdgeValidator(), @@ -757,23 +759,22 @@ Status ConvertAfterShapes(const ConversionParams& params) { : EngineInfo::EngineType::TRTStatic); curr_engine.use_calibration = params.use_calibration; curr_engine.maximum_cached_engines = params.max_cached_engines; - Graph sgraph(flib); status = ModifyGraphForFunctionDef(&graph, curr_engine.segment_graph_def, &sgraph); if (!status.ok()) { - LOG(WARNING) << "Failed to modify graph as a function " - << t << ": " << status; + LOG(WARNING) << "Failed to modify graph as a function " << t << ": " + << status; continue; } FunctionDefLibrary fdeflib; - status = RegisterModifiedGraphToFunctionLibrary(&sgraph, &graph, - fdeflib, curr_engine.engine_name); - + status = RegisterModifiedGraphToFunctionLibrary(&sgraph, &graph, fdeflib, + curr_engine.engine_name); + if (!status.ok()) { - LOG(WARNING) << "Failed to register segment graphdef as a function " - << t << ": " << status; + LOG(WARNING) << "Failed to register segment graphdef as a function " << t + << ": " << status; continue; } diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h index 25bcb345ce5..b4f3849a93a 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h @@ -18,8 +18,8 @@ limitations under the License. 
#include #include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" -#include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/grappler/clusters/cluster.h" #include "tensorflow/core/grappler/costs/graph_properties.h" #include "tensorflow/core/lib/core/status.h" @@ -32,6 +32,8 @@ namespace tensorflow { namespace tensorrt { namespace convert { +// extern const IONamePrefixes prefixes; + struct ConversionParams { const GraphDef* input_graph_def = nullptr; const std::vector* output_names = nullptr; @@ -56,8 +58,7 @@ Status ConvertAfterShapes(const ConversionParams& params); std::pair GetDeviceAndAllocator(const ConversionParams& params, const EngineInfo& engine); -Status ModifyGraphForFunctionDef(Graph* graph, - const GraphDef& segment, +Status ModifyGraphForFunctionDef(Graph* graph, const GraphDef& segment, Graph* sgraph); Status RegisterModifiedGraphToFunctionLibrary(Graph* sgraph, Graph* graph, diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index efb186c4c55..784b29470f6 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -77,18 +77,15 @@ limitations under the License. namespace tensorflow { namespace tensorrt { -// TODO(aaroey): put these constants into some class. -const char* const kInputPHName = "TensorRTInputPH_"; -const char* const kOutputPHName = "TensorRTOutputPH_"; +namespace convert { bool IsEngineInput(absl::string_view name) { - return absl::StartsWith(name, kInputPHName); + return absl::StartsWith(name, prefixes.kInputPHName); } bool IsEngineOutput(absl::string_view name) { - return absl::StartsWith(name, kOutputPHName); + return absl::StartsWith(name, prefixes.kOutputPHName); } -namespace convert { using absl::StrAppend; using absl::StrCat; @@ -364,9 +361,9 @@ string DebugString(const nvinfer1::Permutation& permutation, int len) { string DebugString(const nvinfer1::ITensor& tensor) { return StrCat("nvinfer1::ITensor(@", reinterpret_cast(&tensor), - ", name=", tensor.getName(), - ", dtype=", DebugString(tensor.getType()), - ", dims=", DebugString(tensor.getDimensions()), ")"); + ", name=", tensor.getName(), ", dtype=", + DebugString(tensor.getType()), ", dims=", + DebugString(tensor.getDimensions()), ")"); } Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l, @@ -444,11 +441,10 @@ Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l, for (int i = 0; i < broadcast_num_dims; ++i) { if ((output_l[i] != output_r[i]) && (output_l[i] != 1) && (output_r[i] != 1)) { - return errors::InvalidArgument("Infeasible broadcast scheme (", - "batch_dim: ", output_l[0], ", ", - DebugString(*operand_l_new_dims), " vs ", - "batch_dim: ", output_r[0], ", ", - DebugString(*operand_r_new_dims), ")"); + return errors::InvalidArgument( + "Infeasible broadcast scheme (", "batch_dim: ", output_l[0], ", ", + DebugString(*operand_l_new_dims), " vs ", "batch_dim: ", + output_r[0], ", ", DebugString(*operand_r_new_dims), ")"); } } } @@ -716,8 +712,8 @@ size_t TRT_ShapedWeights::size_bytes() const { string TRT_ShapedWeights::DebugString() const { return StrCat("TRT_ShapedWeights(shape=", convert::DebugString(shape_), - ", type=", convert::DebugString(type_), - ", values=", reinterpret_cast(GetValues()), ")"); + ", type=", convert::DebugString(type_), ", values=", + reinterpret_cast(GetValues()), 
")"); } // A fake ITensor implementation used to check whether the TF-TRT converter can @@ -986,10 +982,8 @@ OpConverterParams::OpConverterParams( use_calibration(converter->use_calibration()) {} const std::set* TrtNodeValidator::quantize_ops = new std::set{ - "QuantizeAndDequantizeV2", - "QuantizeAndDequantizeV3", - "FakeQuantWithMinMaxVars", - "FakeQuantWithMinMaxArgs", + "QuantizeAndDequantizeV2", "QuantizeAndDequantizeV3", + "FakeQuantWithMinMaxVars", "FakeQuantWithMinMaxArgs", }; TrtNodeValidator::TrtNodeValidator( @@ -1068,9 +1062,9 @@ Status TrtNodeValidator::IsTensorRTCandidate(const Node* node) { Status status = ConvertToTensorOrWeights(src_def, edge->src_output(), &tensor_or_weights); if (!status.ok()) { - return errors::Internal( - "Failed to convert input ", src_def.name(), - " to a TRT_TensorOrWeights: ", status.error_message()); + return errors::Internal("Failed to convert input ", src_def.name(), + " to a TRT_TensorOrWeights: ", + status.error_message()); } inputs.push_back(tensor_or_weights); } @@ -1369,9 +1363,9 @@ Status Converter::PrepareTensorForShape(const TRT_TensorOrWeights& input, // CreateConstantLayer. So we can treat it as a tensor for // AreDimsStaticWithDifferentSize(). This really only matters for 0-D tensors. if (AreDimsStaticWithDifferentSize(input_dims, dims, /*is_tensor=*/true)) { - return errors::InvalidArgument( - "Incompatible shapes: ", DebugString(input_dims), " vs. ", - DebugString(dims)); + return errors::InvalidArgument("Incompatible shapes: ", + DebugString(input_dims), " vs. ", + DebugString(dims)); } // ConstantLayer requires static shapes (cannot infer -1). if (input.is_weights() && !HasStaticShape(dims)) { @@ -1461,7 +1455,7 @@ void Converter::MaybeApplyQuantizationRanges() { // Infer ranges across marked ops. PropagateQuantizationRanges(); - // Apply ranges. +// Apply ranges. #if IS_TRT_VERSION_GE(5, 0, 0, 0) for (auto pair : quantization_ranges_) { nvinfer1::ITensor* tensor = pair.first; @@ -1516,19 +1510,15 @@ void Converter::MaybeApplyQuantizationRanges() { const std::vector>> fused_patterns = { {"Fused Conv+Bias+Activation", { - IsConvolution, - IsScale, - IsClipOrRelu, + IsConvolution, IsScale, IsClipOrRelu, }}, {"Fused Conv+Bias", { - IsConvolution, - IsScale, + IsConvolution, IsScale, }}, {"Fused Conv+Activation", { - IsConvolution, - IsClipOrRelu, + IsConvolution, IsClipOrRelu, }}, }; for (int i = 0; i < this->network()->getNbLayers(); i++) { @@ -2108,11 +2098,11 @@ Status ConvertReshape(OpConverterParams* params) { << "\nreshape_batch_dim=" << reshape_batch_dim << ", reshape_dims=" << DebugString(reshape_dims); if (reshape_may_change_batch_dim) { - const string msg = StrCat( - "Reshape on batch dimension is not supported, at ", node_def.name(), - ". input_batch_dim=", input_batch_dim, ", ", DebugString(input_dims), - "; reshape_batch_dim=", reshape_batch_dim, ", ", - DebugString(reshape_dims)); + const string msg = + StrCat("Reshape on batch dimension is not supported, at ", + node_def.name(), ". input_batch_dim=", input_batch_dim, ", ", + DebugString(input_dims), "; reshape_batch_dim=", + reshape_batch_dim, ", ", DebugString(reshape_dims)); return errors::Unimplemented(msg); } @@ -2820,7 +2810,7 @@ Status ConvertActivation(OpConverterParams* params) { params->converter->network()->addActivation(*inputs.at(0).tensor(), op_pair->second); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - // Set parameters. +// Set parameters. 
#if IS_TRT_VERSION_GE(5, 1, 2, 0) if (node_def.op() == "Elu") { layer->setAlpha(1.0f); @@ -4111,8 +4101,8 @@ Status ConvertGather(OpConverterParams* params) { if (trt_gather_output_dims.nbDims != expected_trt_output_rank) { return errors::Internal( "Get unexpected output dimensions of IGatherLayer. Expect nbDims: ", - expected_trt_output_rank, - ", actual nbDims: ", trt_gather_output_dims.nbDims); + expected_trt_output_rank, ", actual nbDims: ", + trt_gather_output_dims.nbDims); } // Reshape the output so after adding the implicit batch dim it'll match the // output shape of TF GatherV2. @@ -4211,8 +4201,9 @@ Status ConvertMatMulHelper(OpConverterParams* params, input_b.GetTrtDims().nbDims == 2; // If int8 is specified, FC must be used unless it is not compatible, as MM // does not support int8 at this time. - if (should_use_fc || (can_use_fc && params->converter->precision_mode() == - TrtPrecisionMode::INT8)) { + if (should_use_fc || + (can_use_fc && + params->converter->precision_mode() == TrtPrecisionMode::INT8)) { return ConvertFullyConnectedHelper( params, input_a.tensor(), input_b.weights(), transpose_b, node_name); } @@ -4228,9 +4219,8 @@ Status ConvertMatMulHelper(OpConverterParams* params, // If the MatMul operand is a constant, applies transposes at conversion-time // as necessary. If the operand is a tensor, does nothing. If required // transposes were applied, sets transpose to false. - const auto prepare_matmul_operand = - [¶ms](TRT_TensorOrWeights operand, - bool* transpose) -> nvinfer1::ITensor* { + const auto prepare_matmul_operand = [¶ms]( + TRT_TensorOrWeights operand, bool* transpose) -> nvinfer1::ITensor* { if (operand.is_tensor()) { return operand.tensor(); } else { @@ -4312,19 +4302,18 @@ Status ConvertBatchMatMul(OpConverterParams* params) { // Tensor with TF Dims: 12 5 3 -> TRT Dims: 5 3 // Weight with TF Dims: 12 3 6 -> TRT Dims: 12 3 6 // It is not possible to treat the weight input as a batched [3, 6] tensor. - const auto check_weight_is_not_batched = - [](const TRT_TensorOrWeights& input_l, - const TRT_TensorOrWeights& input_r) { - // If input_l is a weight, then input_r must be a tensor because - // otherwise the op would be handled by Grappler. - if (input_l.is_weights() && - input_l.GetTrtDims().nbDims > input_r.GetTrtDims().nbDims && - input_l.GetTrtDims().d[0] != 1) { - return errors::Unimplemented( - "TensorRT does not support batched constants."); - } - return Status::OK(); - }; + const auto check_weight_is_not_batched = []( + const TRT_TensorOrWeights& input_l, const TRT_TensorOrWeights& input_r) { + // If input_l is a weight, then input_r must be a tensor because + // otherwise the op would be handled by Grappler. 
+ if (input_l.is_weights() && + input_l.GetTrtDims().nbDims > input_r.GetTrtDims().nbDims && + input_l.GetTrtDims().d[0] != 1) { + return errors::Unimplemented( + "TensorRT does not support batched constants."); + } + return Status::OK(); + }; TF_RETURN_IF_ERROR(check_weight_is_not_batched(inputs.at(0), inputs.at(1))); TF_RETURN_IF_ERROR(check_weight_is_not_batched(inputs.at(1), inputs.at(0))); @@ -5017,12 +5006,12 @@ Status ConvertGraphDefToEngine( for (const auto& node_def : gdef.node()) { string node_name = node_def.name(); VLOG(2) << "Converting op name=" << node_name << ", op=" << node_def.op(); - if (IsEngineInput(node_name)){ + if (IsEngineInput(node_name)) { int32 slot_number = -1; string type_key; if (node_def.op() == "Placeholder") { if (!strings::safe_strto32( // non-absl ok - node_name.c_str() + strlen(kInputPHName), &slot_number)) { + node_name.c_str() + strlen(prefixes.kInputPHName), &slot_number)) { return errors::InvalidArgument("Failed to parse slot number from ", node_name); } @@ -5033,7 +5022,11 @@ Status ConvertGraphDefToEngine( slot_number = node_def.attr().at("index").i(); type_key = "T"; } else { - return errors::InvalidArgument("Node ", node_name, " with name starting with kInputPHName is neither Placeholder nor Arg, instead ", node_def.op()); + return errors::InvalidArgument("Node ", node_name, + " with name starting with kInputPHName " + "is neither Placeholder nor Arg, " + "instead ", + node_def.op()); } nvinfer1::DataType trt_dtype; nvinfer1::Dims trt_dims; @@ -5060,14 +5053,17 @@ Status ConvertGraphDefToEngine( int32 slot_number = -1; if (node_def.op() == "Identity") { if (!strings::safe_strto32( // non-absl ok - node_name.c_str() + strlen(kOutputPHName), &slot_number)) { + node_name.c_str() + strlen(prefixes.kOutputPHName), &slot_number)) { return errors::InvalidArgument("Failed to parse slot number from ", node_name); } } else if (tensorflow::grappler::IsRetval(node_def)) { slot_number = node_def.attr().at("index").i(); } else { - return errors::InvalidArgument("Node with name ", node_name, " starting with kOutputPHName is neither Identity nor Retval, instead ", node_def.op()); + return errors::InvalidArgument("Node with name ", node_name, + " starting with prefixes.kOutputPHName is " + "neither Identity nor Retval, instead ", + node_def.op()); } // Get output type that TensorFlow expects TFAttrs attrs(node_def); @@ -5136,7 +5132,7 @@ Status ConvertSegmentToGraphDef( // Add dummy input/output nodes to the segment graphdef. 
if (connection.is_input_edge) { - const string node_name = StrCat(kInputPHName, connection.port_number); + const string node_name = StrCat(prefixes.kInputPHName, connection.port_number); if (marker_nodes.count(node_name)) { VLOG(1) << "Reusing input " << node_name << " for the edge " << connection.outside_node_name << ":" @@ -5155,7 +5151,7 @@ Status ConvertSegmentToGraphDef( << " -> " << connection.inside_node_name << ":" << connection.inside_port; } else { - const string node_name = StrCat(kOutputPHName, connection.port_number); + const string node_name = StrCat(prefixes.kOutputPHName, connection.port_number); if (marker_nodes.count(node_name)) { VLOG(1) << "Reusing output " << node_name << " for the edge " << connection.inside_node_name << ":" << connection.inside_port @@ -5194,7 +5190,7 @@ Status ConvertSegmentToGraphDef( auto snode = segment_def->mutable_node(old_to_new_id_map[connection.inside_id]); const string placeholder_name = - StrCat(kInputPHName, connection.port_number); + StrCat(prefixes.kInputPHName, connection.port_number); VLOG(1) << "Updating " << snode->name() << ":" << connection.inside_port << " from " << snode->input(connection.inside_port) << " to " << placeholder_name; diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h index a6a7afe121e..9dfe8ed3b1d 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h @@ -38,8 +38,6 @@ limitations under the License. namespace tensorflow { namespace tensorrt { -extern const char* const kInputPHName; -extern const char* const kOutputPHName; namespace convert { @@ -51,6 +49,8 @@ namespace convert { (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR == minor && \ NV_TENSORRT_PATCH == patch && NV_TENSORRT_BUILD >= build)) +extern const IONamePrefixes prefixes = IONamePrefixes(); + struct EngineConnection { // Constructs a non-control edge. EngineConnection(const string& outside, int out_id, int out_port, diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.h b/tensorflow/compiler/tf2tensorrt/convert/utils.h index 91c8c660f85..981c182311b 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/utils.h +++ b/tensorflow/compiler/tf2tensorrt/convert/utils.h @@ -23,6 +23,14 @@ limitations under the License. namespace tensorflow { namespace tensorrt { +class IONamePrefixes { + public: + static constexpr const char* const kInputPHName = "TensorRTInputPH_"; + static constexpr const char* const kOutputPHName = "TensorRTOutputPH_"; + static constexpr const char* const kInputPHNameLower = "tensorrtinputph_"; + static constexpr const char* const kOutputPHNameLower = "tensorrtoutputph_"; +}; + template struct TrtDestroyer { void operator()(T* t) { diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index 4c1a2127fb3..81efdbb8b94 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -22,10 +22,10 @@ limitations under the License. 
#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/calibration_resource.h" +#include "tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h" -#include "tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/op.h" @@ -55,6 +55,9 @@ using ::stream_executor::port::StatusOr; // A helper class to call done() when destructed for asynchronous execution. // Helps simultaneous execution of native and TRT engines. + +auto prefixes = IONamePrefixes(); + class AsyncHelper : public core::RefCounted { public: AsyncHelper(AsyncOpKernel::DoneCallback done) : done_(done) {} @@ -235,16 +238,7 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) OP_REQUIRES_OK(context, context->GetAttr("workspace_size_bytes", &workspace_size_)); OP_REQUIRES_OK(context, context->GetAttr("static_engine", &static_engine_)); - /*if (!static_engine_) { - OP_REQUIRES(context, segment_graph_.ParseFromString(serialized_segment_), - errors::InvalidArgument("Failed to parse segment graphdef!")); - VLOG(1) << "Size of serialized GraphDef: " - << serialized_segment_.capacity(); - string tmp; - // Swap with temporary empty string to deallocate the CPU memory. - serialized_segment_.swap(tmp); - }*/ - + VLOG(1) << "Constructing " << name(); string precision_string; OP_REQUIRES_OK(context, @@ -262,8 +256,9 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) if (!static_engine_) { OP_REQUIRES_OK(context, ConstructFunctionHandle(context)); FunctionLibraryRuntime* lib = context->function_library(); - OP_REQUIRES_OK(context, FunctionDefToGraphDef(native_func_, lib, &segment_graph_, - &input_node_ids_, &output_node_ids_)); + OP_REQUIRES_OK(context, + FunctionDefToGraphDef(native_func_, lib, &segment_graph_, + &input_node_ids_, &output_node_ids_)); } calibration_mode_ = (use_calibration_ && precision_mode_ == TrtPrecisionMode::INT8 && @@ -316,13 +311,12 @@ void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx, core::ScopedUnref sc(helper); TRTCalibrationResource* calib_res = nullptr; OP_REQUIRES_OK_ASYNC( - ctx, - ctx->resource_manager()->LookupOrCreate( - std::string(kCalibrationContainerName), name(), - reinterpret_cast(&calib_res), - {[ctx, this](TRTCalibrationResource** cr) -> Status { - return this->AllocateCalibrationResources(ctx, cr); - }}), + ctx, ctx->resource_manager()->LookupOrCreate( + std::string(kCalibrationContainerName), name(), + reinterpret_cast(&calib_res), + {[ctx, this](TRTCalibrationResource** cr) -> Status { + return this->AllocateCalibrationResources(ctx, cr); + }}), *helper); core::ScopedUnref calib_sc(calib_res); int num_inputs = ctx->num_inputs(); @@ -340,9 +334,9 @@ void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx, const auto device_tensor = calib_res->device_tensors_.at(i).AccessTensor(ctx); CHECK_EQ(t.TotalBytes(), device_tensor->TotalBytes()); - input_data.emplace(StrCat(kInputPHName, - static_engine_ ? i : input_node_ids_[i]), - data_address); + input_data.emplace( + StrCat(prefixes.kInputPHName, static_engine_ ? 
i : input_node_ids_[i]), + data_address); } VLOG(2) << "Filled map for sending"; // copied from cuda_kernel_helper since it seems only valid in *.cu.cc files @@ -421,9 +415,9 @@ Status TRTEngineOp::GetEngineInputShapes( // This should not happen, but just for safety. if (actual_input_shapes.size() != cached_input_shapes.size()) { return errors::InvalidArgument( - "Input shape list size mismatch for ", name(), - ", cached size: ", cached_input_shapes.size(), - " vs. actual size: ", actual_input_shapes.size()); + "Input shape list size mismatch for ", name(), ", cached size: ", + cached_input_shapes.size(), " vs. actual size: ", + actual_input_shapes.size()); } if (match_shapes(actual_input_shapes, cached_input_shapes)) { const int cached_batch_size = cached_input_shapes[0].dim_size(0); @@ -483,7 +477,8 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx, std::vector buffers(num_binding); for (int i = 0; i < ctx->num_inputs(); i++) { - const string input_name = StrCat(kInputPHName, static_engine_ ? i : input_node_ids_[i]); + const string input_name = + StrCat(prefixes.kInputPHName, static_engine_ ? i : input_node_ids_[i]); const int binding_index = cuda_engine->getBindingIndex(input_name.c_str()); if (binding_index == -1) { const string msg = @@ -525,7 +520,8 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx, for (int i = 0; i < ctx->num_outputs(); i++) { // Create an output tensor - const string output_name = StrCat(kOutputPHName, static_engine_ ? i : output_node_ids_[i]); + const string output_name = StrCat(prefixes.kOutputPHName, + static_engine_ ? i : output_node_ids_[i]); const int binding_index = cuda_engine->getBindingIndex(output_name.c_str()); Tensor* output_tensor = nullptr; @@ -764,7 +760,7 @@ Status TRTEngineOp::AllocateCalibrationResources(OpKernelContext* ctx, "Unsupported data type encountered in input ", i); } cres->device_buffers_.emplace( - StrCat(kInputPHName, static_engine_ ? i : input_node_ids_[i]), + StrCat(prefixes.kInputPHName, static_engine_ ? i : input_node_ids_[i]), std::pair(device_address, device_tensor->TotalBytes())); } cres->calibrator_.reset( diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc index dc31e5c156e..4eef454f8f3 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc @@ -61,8 +61,6 @@ class TRTEngineOpTestBase : public OpsTestBase { // Serialize the graph. TRTEngineOp will convert it using dynamic mode. GraphDef graph_def; TF_ASSERT_OK(s.ToGraphDef(&graph_def)); - /* - */ const string func_name = "myop_native_segment"; Graph* graph = s.graph(); Graph sgraph(graph->flib_def()); @@ -70,30 +68,17 @@ class TRTEngineOpTestBase : public OpsTestBase { graph, graph_def, &sgraph)); TF_ASSERT_OK(convert::RegisterModifiedGraphToFunctionLibrary(&sgraph, graph, flib_def_->ToProto(), "myop")); - //TF_ASSERT_OK(convert::RegisterSegmentFunctionToFunctionLibrary(graph, graph_def, "myop")); - - //FunctionDefLibrary fdeflib; - //auto native_segment = fdeflib.add_function(); - - //GraphToFunctionDef(*graph, func_name, native_segment); - /*(*native_segment - ->mutable_attr())[FunctionLibraryDefinition::kIntsOnDeviceAttr] - .set_b(true); - */ - - //graph->AddFunctionLibrary(fdeflib); PartialTensorShape shape({-1, -1}); - // Create the op. 
OpsTestBase::SetDevice(DEVICE_GPU, std::move(device)); TF_ASSERT_OK(NodeDefBuilder("myop", "TRTEngineOp") .Input(FakeInput(1, dtype)) .Attr("input_shapes", {shape}) .Attr("output_shapes", {shape}) .Attr("static_engine", false) - .Attr("segment_funcdef_name", func_name) // no native fallback - .Attr("serialized_segment", "")//graph_def.SerializeAsString()) + .Attr("segment_funcdef_name", func_name) + .Attr("serialized_segment", "") .Attr("calibration_data", "") .Attr("max_cached_engines_count", max_cached_engines_count) .Attr("workspace_size_bytes", 1 << 20) diff --git a/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc b/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc index af76d84b232..13457ba5fd2 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc @@ -14,37 +14,32 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.h" -//#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" -#include "tensorflow/core/common_runtime/graph_optimizer.h" +#include "absl/strings/ascii.h" +#include "absl/strings/str_cat.h" #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/graph_optimizer.h" #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/platform/logging.h" -#include "absl/strings/str_cat.h" -#include "absl/strings/ascii.h" namespace tensorflow { namespace tensorrt { -const char* const kInputPHName = "TensorRTInputPH_"; -const char* const kOutputPHName = "TensorRTOutputPH_"; -const char* const kInputPHNameLower = "tensorrtinputph_"; -const char* const kOutputPHNameLower = "tensorrtoutputph_"; +auto prefixes = IONamePrefixes(); -string NewNameWithIOPrefix(const Node* n) { - if (absl::StartsWith(n->name(), kInputPHNameLower)){ - return strings::StrCat(kInputPHName, n->id()); - } - else if (absl::StartsWith(n->name(), kOutputPHNameLower)) { - return strings::StrCat(kOutputPHName, n->id()); +string AppendIdToNodeName(const Node* n) { + if (absl::StartsWith(n->name(), prefixes.kInputPHNameLower)) { + return strings::StrCat(prefixes.kInputPHName, n->id()); + } else if (absl::StartsWith(n->name(), prefixes.kOutputPHNameLower)) { + return strings::StrCat(prefixes.kOutputPHName, n->id()); } return strings::StrCat("n", n->id()); } void ToGraphDefWithIOPrefix(const Graph* g, GraphDef* gdef) { // This is the same function as in function.cc. 
However, it uses the - // NewName mapping above, which retains IO prefixes (kInputPHName etc) + // name mapping above, which retains IO prefixes (prefixes.kInputPHName etc) gtl::InlinedVector inputs; gdef->Clear(); *gdef->mutable_versions() = g->versions(); @@ -59,7 +54,7 @@ void ToGraphDefWithIOPrefix(const Graph* g, GraphDef* gdef) { ReverseDFSFrom(*g, start_nodes, nullptr, [gdef, &inputs](Node* n) { if (!n->IsOp()) return; NodeDef* ndef = gdef->add_node(); - ndef->set_name(NewNameWithIOPrefix(n)); + ndef->set_name(AppendIdToNodeName(n)); ndef->set_op(n->type_string()); for (const auto& attr : n->attrs()) { (*ndef->mutable_attr())[attr.first] = attr.second; @@ -93,7 +88,7 @@ void ToGraphDefWithIOPrefix(const Graph* g, GraphDef* gdef) { ndef->add_input("unknown"); continue; } - const string srcname = NewNameWithIOPrefix(e->src()); + const string srcname = AppendIdToNodeName(e->src()); if (!e->src()->IsOp()) { } else if (e->IsControlEdge()) { ndef->add_input(strings::StrCat("^", srcname)); @@ -108,52 +103,33 @@ void ToGraphDefWithIOPrefix(const Graph* g, GraphDef* gdef) { Status FunctionDefToGraphDef(FunctionLibraryRuntime::Handle handle, FunctionLibraryRuntime* flib_runtime, - GraphDef* graph_def, + GraphDef* graph_def, std::vector* input_node_ids, std::vector* output_node_ids) { - const FunctionLibraryDefinition* flib_def = flib_runtime->GetFunctionLibraryDefinition(); + const FunctionLibraryDefinition* flib_def = + flib_runtime->GetFunctionLibraryDefinition(); const FunctionBody* fbody; fbody = flib_runtime->GetFunctionBody(handle); - //TF_RET_CHECK(*fbody) + if (!fbody) { + return errors::Internal( + "Function body is null when converting from FuncDef to GraphDef."); + } std::unique_ptr graph(new Graph(flib_def)); - + CopyGraph(*fbody->graph, graph.get()); - // Copied from compiler/xla/compile_xla.cc : - /* - OptimizerOptions opts; - opts.set_opt_level(OptimizerOptions::L0); - opts.set_do_common_subexpression_elimination(false); - opts.set_do_function_inlining(true); - opts.set_do_constant_folding(true); - GraphOptimizer optimizer(opts); - auto cf_consider_fn = [](const Node* n) { - for (const auto& output_arg : n->op_def().output_arg()) { - if (output_arg.type() == DT_VARIANT) { - return false; - } - } - return true; - }; - GraphOptimizer::Options graph_optimizer_options; - graph_optimizer_options.cf_consider_fn = cf_consider_fn; - - */ - //optimizer.Optimize(flib_runtime, flib_runtime->env(), - // /*device=*/nullptr, &graph, graph_optimizer_options); - for (Node* n : graph->nodes()) { auto id = n->id(); if (n->IsArg()) { - VLOG(1) << "Arg Node id " << id; + VLOG(2) << "Arg Node id used for unique naming is " << id; input_node_ids->push_back(id); } if (n->IsRetval()) { - VLOG(1) << "Retval Node id " << id; + VLOG(2) << "Retval Node id used for unique naming is " << id; output_node_ids->push_back(id); } } - + ToGraphDefWithIOPrefix(graph.release(), graph_def); for (const auto node_def : graph_def->node()) { @@ -161,8 +137,6 @@ Status FunctionDefToGraphDef(FunctionLibraryRuntime::Handle handle, } return Status::OK(); - -} - +} } } diff --git a/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.h b/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.h index ffc702679e0..6acc21242a1 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.h +++ b/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.h @@ -16,6 +16,7 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_FUNCDEF_TO_GRAPHDEF_H_ #define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_FUNCDEF_TO_GRAPHDEF_H_ +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/framework/function.h" @@ -26,16 +27,18 @@ namespace tensorflow { namespace tensorrt { -string NewNameWithIOPrefix(const Node* n); +string AppendIdToNodeName(const Node* n); + void ToGraphDefWithIOPrefix(const Graph* g, GraphDef* gdef); + Status FunctionDefToGraphDef(FunctionLibraryRuntime::Handle handle, FunctionLibraryRuntime* flib_runtime, GraphDef* graph_def, - std::vector* input_node_ids, - std::vector* output_node_ids); + std::vector* input_node_ids, + std::vector* output_node_ids); -} // namespace tensorrt -} // namespace tensorflow +} // namespace tensorrt +} // namespace tensorflow #endif #endif diff --git a/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py b/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py index a41f965573a..6627c3788a4 100644 --- a/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py +++ b/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py @@ -562,9 +562,7 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): self.assertNotEmpty(segment_funcdef_name, node.name) self.assertIn(function_name, functions) else: - #self.assertEmpty(segment_funcdef_name, node.name) self.assertTrue(len(node.attr["serialized_segment"].s), node.name) - #self.assertNotIn(function_name, functions) self.assertIn(node.name, expected_engines) self.assertEqual( self._ToBytes(run_params.precision_mode), diff --git a/tensorflow/python/compiler/tensorrt/trt_convert_test.py b/tensorflow/python/compiler/tensorrt/trt_convert_test.py index cdd24ce041e..b8376a5ca65 100644 --- a/tensorflow/python/compiler/tensorrt/trt_convert_test.py +++ b/tensorflow/python/compiler/tensorrt/trt_convert_test.py @@ -449,8 +449,6 @@ class TrtConvertTest(test_util.TensorFlowTestCase): except errors.OpError as e: # This should happen only when fallback path is disabled and TRT engine # fails to run. - # TODO(phillip-kravtsov) Check what correct handling is - #self.assertTrue(not use_function_backup and not expect_engine_is_run) self.assertIn("Fallback path is disabled, for TRTEngineOp_0", str(e)) @test_util.deprecated_graph_mode_only From 5e7b18c892dad02cab0663471c2df340b21a7ea0 Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Tue, 16 Jul 2019 20:45:50 -0700 Subject: [PATCH 0112/3053] Removed duplicate function in trt_engine_op.cc --- .../tf2tensorrt/kernels/trt_engine_op.cc | 41 +++++-------------- 1 file changed, 11 insertions(+), 30 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index 81efdbb8b94..e49a7e9b104 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -95,9 +95,9 @@ class TRTEngineOp : public AsyncOpKernel { // Construct a function handle for executing native funcdef graph // These are the exact same function. - Status ConstructFunctionHandle(OpKernelContext* ctx); - Status ConstructFunctionHandle(OpKernelConstruction* ctx); + Status ConstructFunctionHandle(FunctionLibraryRuntime* lib, + const string& device_name); // Execute replaced native segment as function Op. 
void ExecuteNativeSegment(OpKernelContext* ctx, AsyncHelper* helper); @@ -188,9 +188,10 @@ void* GetTensorAddress(const Tensor* tensor_ptr) { } } -Status TRTEngineOp::ConstructFunctionHandle(OpKernelContext* ctx) { +Status TRTEngineOp::ConstructFunctionHandle(FunctionLibraryRuntime* lib, + const string& device_name) { VLOG(1) << "Constructing function handle"; - auto lib = ctx->function_library(); + // auto lib = ctx->function_library(); if (lib == nullptr) { return errors::Internal("Context function library is null"); } @@ -201,30 +202,7 @@ Status TRTEngineOp::ConstructFunctionHandle(OpKernelContext* ctx) { } FunctionLibraryRuntime::InstantiateOptions inst_ops; inst_ops.state_handle = ""; - inst_ops.target = ctx->device()->name(); - native_func_ = 0; - return lib->Instantiate(funcdef_name_, AttrSlice(&fdef->attr()), inst_ops, - &native_func_); -} - -Status TRTEngineOp::ConstructFunctionHandle(OpKernelConstruction* ctx) { - VLOG(1) << "Constructing function handle"; - auto lib = ctx->function_library(); - if (lib == nullptr) { - return errors::Internal("Context function library is null"); - } - auto func_names = lib->GetFunctionLibraryDefinition()->ListFunctionNames(); - for (auto func_name : func_names) { - VLOG(2) << "Func name: " << func_name; - } - auto fdef = lib->GetFunctionLibraryDefinition()->Find(funcdef_name_); - if (fdef == nullptr) { - return errors::Internal("Native FunctionDef ", funcdef_name_, - " can't be found in function library"); - } - FunctionLibraryRuntime::InstantiateOptions inst_ops; - inst_ops.state_handle = ""; - inst_ops.target = ctx->device()->name(); + inst_ops.target = device_name; native_func_ = 0; return lib->Instantiate(funcdef_name_, AttrSlice(&fdef->attr()), inst_ops, &native_func_); @@ -254,7 +232,8 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) context->GetAttr("use_calibration", &use_calibration_)); native_func_ = kInvalidHandle; if (!static_engine_) { - OP_REQUIRES_OK(context, ConstructFunctionHandle(context)); + OP_REQUIRES_OK(context, ConstructFunctionHandle(context->function_library(), + context->device()->name())); FunctionLibraryRuntime* lib = context->function_library(); OP_REQUIRES_OK(context, FunctionDefToGraphDef(native_func_, lib, &segment_graph_, @@ -279,7 +258,9 @@ void TRTEngineOp::ExecuteNativeSegment(OpKernelContext* ctx, std::vector inputs; std::vector* outputs = new std::vector(); if (native_func_ == kInvalidHandle) { - OP_REQUIRES_OK_ASYNC(ctx, ConstructFunctionHandle(ctx), *helper); + OP_REQUIRES_OK_ASYNC(ctx, ConstructFunctionHandle(ctx->function_library(), + ctx->device()->name()), + *helper); } auto lib = ctx->function_library(); FunctionLibraryRuntime::Options opts; From 23491b52002bd85c7b0b9e5a4b79382dd8dbd5d3 Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Thu, 11 Jul 2019 18:35:46 -0700 Subject: [PATCH 0113/3053] Inital commit: removed serialized string from dynamic TRT engine. 
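The TF-TRT patches above route engine inputs and outputs through placeholder node names built from fixed prefixes (TensorRTInputPH_ and TensorRTOutputPH_, now owned by the IONamePrefixes class) plus a slot or node id. Purely to illustrate that naming convention, here is a small Python sketch of how bindings can be matched and slots recovered by prefix; these helpers are made up for illustration and are not part of the TensorFlow API:

```
INPUT_PREFIX = 'TensorRTInputPH_'    # mirrors IONamePrefixes::kInputPHName
OUTPUT_PREFIX = 'TensorRTOutputPH_'  # mirrors IONamePrefixes::kOutputPHName

def is_engine_input(name):
  return name.startswith(INPUT_PREFIX)

def is_engine_output(name):
  return name.startswith(OUTPUT_PREFIX)

def slot_number(name):
  # The converter recovers a binding slot by stripping the prefix,
  # e.g. 'TensorRTInputPH_0' -> 0; after the dynamic-engine change the
  # suffix may be a node id rather than a positional index.
  prefix = INPUT_PREFIX if is_engine_input(name) else OUTPUT_PREFIX
  return int(name[len(prefix):])

print(is_engine_input('TensorRTInputPH_0'))   # True
print(slot_number('TensorRTOutputPH_2'))      # 2
```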
--- tensorflow/compiler/tf2tensorrt/BUILD | 7 + .../tf2tensorrt/convert/convert_graph.cc | 81 +++++---- .../tf2tensorrt/convert/convert_graph.h | 12 ++ .../tf2tensorrt/convert/convert_nodes.cc | 40 ++-- .../tf2tensorrt/kernels/trt_engine_op.cc | 74 +++++++- .../tf2tensorrt/kernels/trt_engine_op_test.cc | 35 +++- .../tf2tensorrt/utils/funcdef_to_graphdef.cc | 172 ++++++++++++++++++ .../tf2tensorrt/utils/funcdef_to_graphdef.h | 42 +++++ .../test/tf_trt_integration_test_base.py | 10 +- 9 files changed, 415 insertions(+), 58 deletions(-) create mode 100644 tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc create mode 100644 tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.h diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD index bfaae215709..bca101c4a53 100644 --- a/tensorflow/compiler/tf2tensorrt/BUILD +++ b/tensorflow/compiler/tf2tensorrt/BUILD @@ -168,6 +168,7 @@ tf_cuda_cc_test( ":trt_op_kernels", ":trt_op_libs", ":trt_resources", + ":trt_conversion", "@com_google_googletest//:gtest", "//tensorflow/cc:cc_ops", "//tensorflow/cc:ops", @@ -238,11 +239,13 @@ tf_cuda_library( "utils/calibration_resource.cc", "utils/trt_int8_calibrator.cc", "utils/trt_lru_cache.cc", + "utils/funcdef_to_graphdef.cc", ], hdrs = [ "utils/calibration_resource.h", "utils/trt_int8_calibrator.h", "utils/trt_lru_cache.h", + "utils/funcdef_to_graphdef.h", ], deps = [ ":trt_allocator", @@ -250,6 +253,10 @@ tf_cuda_library( ":utils", "//tensorflow/core:framework_headers_lib", "//tensorflow/core:framework_lite", + #"//tensorflow/core:framework", + "//tensorflow/core/grappler:op_types", + "//tensorflow/core:graph", + "//tensorflow/core:gpu_runtime", "//tensorflow/core:lib_proto_parsing", ] + if_tensorrt([":tensorrt_lib"]), ) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index fb5dda9953e..0c2831df275 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -135,6 +135,7 @@ Status GetEngineInfo(const Graph* g, DeviceNameUtils::ParsedName parsed_name; const bool parse_succeeded = DeviceNameUtils::ParseFullName(node_device, &parsed_name); + VLOG(0) << node_device; if (!parse_succeeded || (parse_succeeded && parsed_name.type == "CPU")) { string msg; if (!parse_succeeded) { @@ -441,7 +442,8 @@ Status CreateTRTNode(const ConversionParams& params, segment_string = string(static_cast(engine_data->data()), engine_data->size()); } else { - segment_string = info.segment_graph_def.SerializeAsString(); + //segment_string = info.segment_graph_def.SerializeAsString(); + segment_string = ""; } string prec_string; @@ -461,15 +463,13 @@ Status CreateTRTNode(const ConversionParams& params, } NodeDef trt_node; + //TODO(phillip-kravtsov): use_function_backup: fix this Status status = node_builder.Attr("input_shapes", input_shape_protos) .Attr("output_shapes", output_shape_protos) .Attr("static_engine", info.engine_type == EngineInfo::EngineType::TRTStatic) - .Attr("segment_funcdef_name", - params.use_function_backup - ? 
StrCat(info.engine_name, "_native_segment") - : "") + .Attr("segment_funcdef_name", StrCat(info.engine_name, "_native_segment")) .Attr("serialized_segment", segment_string) .Attr("calibration_data", "") .Attr("max_cached_engines_count", info.maximum_cached_engines) @@ -539,15 +539,15 @@ Status CreateTRTNode(const ConversionParams& params, } // Function to construct a funcdef from the segment and add it to the graph. -Status RegisterSegmentFunctionToFunctionLibrary(Graph* graph, - const GraphDef& segment, - const string& engine_name) { - Graph sgraph(graph->flib_def()); +Status ModifyGraphForFunctionDef(Graph* graph, + const GraphDef& segment, + Graph* sgraph) { + //Graph sgraph(graph->flib_def()); GraphConstructorOptions gcopts; - TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(gcopts, segment, &sgraph)); + TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(gcopts, segment, sgraph)); std::map io_nodes; int num_inputs = 0; - for (auto n : sgraph.op_nodes()) { + for (auto n : sgraph->op_nodes()) { if (absl::StartsWith(n->name(), kInputPHName)) { num_inputs++; io_nodes.insert({n->name(), n}); @@ -567,12 +567,12 @@ Status RegisterSegmentFunctionToFunctionLibrary(Graph* graph, .Attr("index", i) .Finalize(&nd)); Status s; - auto node_arg = sgraph.AddNode(nd, &s); + auto node_arg = sgraph->AddNode(nd, &s); if (!s.ok()) { LOG(ERROR) << "Couldn't add _Arg node for " << name; } for (auto edge : node->out_edges()) { - sgraph.AddEdge(node_arg, 0, edge->dst(), edge->dst_input()); + sgraph->AddEdge(node_arg, 0, edge->dst(), edge->dst_input()); VLOG(1) << "Updating funcdef input " << node_arg->name() << ":" << 0 << " - > " << edge->dst()->name() << ":" << edge->dst_input(); if (!s.ok()) { @@ -580,7 +580,7 @@ Status RegisterSegmentFunctionToFunctionLibrary(Graph* graph, << " to " << edge->dst()->name() << ":" << edge->dst_input(); } } - sgraph.RemoveNode(node); + sgraph->RemoveNode(node); } for (int i = 0; i < io_nodes.size() - num_inputs; ++i) { @@ -604,34 +604,40 @@ Status RegisterSegmentFunctionToFunctionLibrary(Graph* graph, VLOG(3) << nd.DebugString(); } Status s; - auto node_ret = sgraph.AddNode(nd, &s); + auto node_ret = sgraph->AddNode(nd, &s); if (!s.ok()) { LOG(ERROR) << "Couldn't add _Ret node for " << name; } VLOG(1) << "Update edge from " << edge->src()->name() << ":" << edge->src_output() << " - > " << node_ret->name() << ":" << 0; - sgraph.AddEdge(edge->src(), edge->src_output(), node_ret, 0); - s = sgraph.UpdateEdge(edge->src(), edge->src_output(), node_ret, 0); + sgraph->AddEdge(edge->src(), edge->src_output(), node_ret, 0); + s = sgraph->UpdateEdge(edge->src(), edge->src_output(), node_ret, 0); if (!s.ok()) { LOG(ERROR) << "Failed to update edge from " << edge->src()->name() << ":" << edge->src_output() << " - > " << node_ret->name() << ":" << 0; } - sgraph.RemoveNode(node); + sgraph->RemoveNode(node); } - FunctionDefLibrary fdeflib; + return Status::OK(); +} + +Status RegisterModifiedGraphToFunctionLibrary(Graph* sgraph, Graph* graph, + FunctionDefLibrary fdeflib, + const string& engine_name) { auto native_segment = fdeflib.add_function(); TF_RETURN_IF_ERROR(GraphToFunctionDef( - sgraph, StrCat(engine_name, "_native_segment"), native_segment)); + *sgraph, StrCat(engine_name, "_native_segment"), native_segment)); // Set kIntsonDeviceAttr to true so that all TRTEngineOp outputs are always on // a GPU device as expected. Otherwise, some of the tensors of type DT_INT32 // would be on host if the op generating the tensor has host memory tag set. 
(*native_segment ->mutable_attr())[FunctionLibraryDefinition::kIntsOnDeviceAttr] .set_b(true); - if (VLOG_IS_ON(7)) { - VLOG(7) << engine_name << " Function_Def "; - VLOG(7) << native_segment->DebugString(); + //TODO(phillip-kravtsov): set this back to 7 + if (VLOG_IS_ON(0)) { + VLOG(0) << engine_name << " Function_Def "; + VLOG(0) << native_segment->DebugString(); } VLOG(1) << "Adding funcdef to graphlib"; TF_RETURN_IF_ERROR(graph->AddFunctionLibrary(fdeflib)); @@ -761,14 +767,24 @@ Status ConvertAfterShapes(const ConversionParams& params) { : EngineInfo::EngineType::TRTStatic); curr_engine.use_calibration = params.use_calibration; curr_engine.maximum_cached_engines = params.max_cached_engines; - if (params.use_function_backup) { - status = RegisterSegmentFunctionToFunctionLibrary( - &graph, curr_engine.segment_graph_def, curr_engine.engine_name); - if (!status.ok()) { - LOG(WARNING) << "Failed to register segment graphdef as a function " - << t << ": " << status; - continue; - } + + + Graph sgraph(flib); + status = ModifyGraphForFunctionDef(&graph, curr_engine.segment_graph_def, + &sgraph); + if (!status.ok()) { + LOG(WARNING) << "Failed to modify graph as a function " + << t << ": " << status; + continue; + } + FunctionDefLibrary fdeflib; + status = RegisterModifiedGraphToFunctionLibrary(&sgraph, &graph, + fdeflib, curr_engine.engine_name); + + if (!status.ok()) { + LOG(WARNING) << "Failed to register segment graphdef as a function " + << t << ": " << status; + continue; } engine_bytes_size.push_back(curr_engine.segment_graph_def.ByteSizeLong()); @@ -777,7 +793,8 @@ Status ConvertAfterShapes(const ConversionParams& params) { engine_segments.push_back(std::move(curr_engine)); converted_segments.push_back(std::move(curr_segment)); - if (VLOG_IS_ON(8)) { + if (VLOG_IS_ON(8) && + curr_engine.engine_type == EngineInfo::EngineType::TRTStatic) { string fname = engine_segments.back().engine_name; StrAppend(&fname, ".pb"); std::fstream f; diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h index d7f1df5a102..74135e56cf4 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h @@ -19,6 +19,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" #include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/grappler/clusters/cluster.h" #include "tensorflow/core/grappler/costs/graph_properties.h" #include "tensorflow/core/lib/core/status.h" @@ -57,6 +58,17 @@ Status ConvertAfterShapes(const ConversionParams& params); std::pair GetDeviceAndAllocator(const ConversionParams& params, const EngineInfo& engine); +/*Status RegisterSegmentFunctionToFunctionLibrary(Graph* graph, + const GraphDef& segment, + const string& engine_name); + */ +Status ModifyGraphForFunctionDef(Graph* graph, + const GraphDef& segment, + Graph* sgraph); + +Status RegisterModifiedGraphToFunctionLibrary(Graph* sgraph, Graph* graph, + FunctionDefLibrary fdeflib, + const string& engine_name); } // namespace convert } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index c34f85e61a8..efb186c4c55 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -40,6 +40,7 @@ limitations under the License. #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/grappler/op_types.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/strings/numbers.h" @@ -5016,19 +5017,30 @@ Status ConvertGraphDefToEngine( for (const auto& node_def : gdef.node()) { string node_name = node_def.name(); VLOG(2) << "Converting op name=" << node_name << ", op=" << node_def.op(); - if (IsEngineInput(node_name) && (node_def.op() == "Placeholder")) { + if (IsEngineInput(node_name)){ int32 slot_number = -1; - if (!strings::safe_strto32( // non-absl ok - node_name.c_str() + strlen(kInputPHName), &slot_number)) { - return errors::InvalidArgument("Failed to parse slot number from ", - node_name); + string type_key; + if (node_def.op() == "Placeholder") { + if (!strings::safe_strto32( // non-absl ok + node_name.c_str() + strlen(kInputPHName), &slot_number)) { + return errors::InvalidArgument("Failed to parse slot number from ", + node_name); + } + type_key = "dtype"; + } else if (tensorflow::grappler::IsArg(node_def)) { + // Maybe remove the dependence on grappler and re-implement IsArg, + // which is pretty simple (but could change if new Arg nodes are added) + slot_number = node_def.attr().at("index").i(); + type_key = "T"; + } else { + return errors::InvalidArgument("Node ", node_name, " with name starting with kInputPHName is neither Placeholder nor Arg, instead ", node_def.op()); } nvinfer1::DataType trt_dtype; nvinfer1::Dims trt_dims; int batch_size = -1; auto shape = input_shapes.at(slot_number); auto status = ValidateTensorProperties( - node_def.op(), node_def.attr().at("dtype").type(), shape, + node_def.op(), node_def.attr().at(type_key).type(), shape, /*validation_only=*/false, &trt_dtype, &trt_dims, &batch_size); if (!status.ok()) { const string error_message = @@ -5044,12 +5056,18 @@ Status ConvertGraphDefToEngine( // engines offline, by calling sess.run() and cache/serialize the engines. 
TF_RETURN_IF_ERROR( converter.AddInputTensor(node_name, trt_dtype, trt_dims, batch_size)); - } else if (IsEngineOutput(node_name) && (node_def.op() == "Identity")) { + } else if (IsEngineOutput(node_name)) { int32 slot_number = -1; - if (!strings::safe_strto32( // non-absl ok - node_name.c_str() + strlen(kOutputPHName), &slot_number)) { - return errors::InvalidArgument("Failed to parse slot number from ", - node_name); + if (node_def.op() == "Identity") { + if (!strings::safe_strto32( // non-absl ok + node_name.c_str() + strlen(kOutputPHName), &slot_number)) { + return errors::InvalidArgument("Failed to parse slot number from ", + node_name); + } + } else if (tensorflow::grappler::IsRetval(node_def)) { + slot_number = node_def.attr().at("index").i(); + } else { + return errors::InvalidArgument("Node with name ", node_name, " starting with kOutputPHName is neither Identity nor Retval, instead ", node_def.op()); } // Get output type that TensorFlow expects TFAttrs attrs(node_def); diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index ab0b21edc41..2b569d177e1 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -25,6 +25,7 @@ limitations under the License. #include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h" +#include "tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/op.h" @@ -90,8 +91,11 @@ class TRTEngineOp : public AsyncOpKernel { void ExecuteCalibration(OpKernelContext* ctx, AsyncHelper* helper); // Construct a function handle for executing native funcdef graph + // These are the exact same function. Status ConstructFunctionHandle(OpKernelContext* ctx); + Status ConstructFunctionHandle(OpKernelConstruction* ctx); + // Execute replaced native segment as function Op. void ExecuteNativeSegment(OpKernelContext* ctx, AsyncHelper* helper); @@ -124,6 +128,12 @@ class TRTEngineOp : public AsyncOpKernel { std::vector input_nodes_; std::vector output_nodes_; + // The id's in these vectors are used for getting slot numbers and + // node names after they are uniquified in graph->graphdef conversion. + + std::vector input_node_ids_; + std::vector output_node_ids_; + // serialized protobuf segment or trt engine depending on static_engine_ flag. 
string serialized_segment_; @@ -198,6 +208,29 @@ Status TRTEngineOp::ConstructFunctionHandle(OpKernelContext* ctx) { &native_func_); } +Status TRTEngineOp::ConstructFunctionHandle(OpKernelConstruction* ctx) { + VLOG(1) << "Constructing function handle"; + auto lib = ctx->function_library(); + if (lib == nullptr) { + return errors::Internal("Context function library is null"); + } + auto func_names = lib->GetFunctionLibraryDefinition()->ListFunctionNames(); + for (auto func_name : func_names) { + VLOG(0) << "Func name: " << func_name; + } + auto fdef = lib->GetFunctionLibraryDefinition()->Find(funcdef_name_); + if (fdef == nullptr) { + return errors::Internal("Native FunctionDef ", funcdef_name_, + " can't be found in function library"); + } + FunctionLibraryRuntime::InstantiateOptions inst_ops; + inst_ops.state_handle = ""; + inst_ops.target = ctx->device()->name(); + native_func_ = 0; + return lib->Instantiate(funcdef_name_, AttrSlice(&fdef->attr()), inst_ops, + &native_func_); +} + TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) : AsyncOpKernel(context) { // read serialized_engine @@ -206,7 +239,7 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) OP_REQUIRES_OK(context, context->GetAttr("workspace_size_bytes", &workspace_size_)); OP_REQUIRES_OK(context, context->GetAttr("static_engine", &static_engine_)); - if (!static_engine_) { + /*if (!static_engine_) { OP_REQUIRES(context, segment_graph_.ParseFromString(serialized_segment_), errors::InvalidArgument("Failed to parse segment graphdef!")); VLOG(1) << "Size of serialized GraphDef: " @@ -214,7 +247,8 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) string tmp; // Swap with temporary empty string to deallocate the CPU memory. serialized_segment_.swap(tmp); - } + }*/ + VLOG(1) << "Constructing " << name(); string precision_string; OP_REQUIRES_OK(context, @@ -228,6 +262,25 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) TrtPrecisionModeFromName(precision_string, &precision_mode_)); OP_REQUIRES_OK(context, context->GetAttr("use_calibration", &use_calibration_)); + native_func_ = kInvalidHandle; + if (!static_engine_) { + //TODO(phillip-kravtsov) error checking here: how? + VLOG(0) << "Funcdef_name: " << funcdef_name_; + VLOG(0) << "Static Engine? " << static_engine_; + Status status = ConstructFunctionHandle(context); + VLOG(0) << "Status: " << status; + FunctionLibraryRuntime* lib = context->function_library(); + VLOG(0) << "Funcdef to graphdef"; + FunctionDefToGraphDef(native_func_, lib, &segment_graph_, + &input_node_ids_, &output_node_ids_); + for (int id : input_node_ids_) { + VLOG(0) << "Input node id: " << id << " from engine " << name(); + } + for (int id : output_node_ids_) { + VLOG(0) << "Output node id: " << id << " from engine " << name(); + } + + } calibration_mode_ = (use_calibration_ && precision_mode_ == TrtPrecisionMode::INT8 && calibration_data.empty()); @@ -235,7 +288,6 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) calibrator_.reset(new TRTInt8Calibrator(calibration_data)); calibration_data.resize(0); } - native_func_ = kInvalidHandle; OP_REQUIRES_OK(context, context->GetAttr("max_cached_engines_count", &max_cached_engines_)); } @@ -309,7 +361,9 @@ void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx, const auto device_tensor = calib_res->device_tensors_.at(i).AccessTensor(ctx); CHECK_EQ(t.TotalBytes(), device_tensor->TotalBytes()); - input_data.emplace(StrCat(kInputPHName, i), data_address); + input_data.emplace(StrCat(kInputPHName, + static_engine_ ? 
i : input_node_ids_[i]), + data_address); } VLOG(2) << "Filled map for sending"; // copied from cuda_kernel_helper since it seems only valid in *.cu.cc files @@ -446,9 +500,15 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx, // input. const int num_batch = ctx->input(0).shape().dim_size(0); const int num_binding = ctx->num_inputs() + ctx->num_outputs(); + for (int i = 0; i < num_binding; i++) { + auto binding_name = cuda_engine->getBindingName(i); + VLOG(0) << "Binding name for index " << i << " " << binding_name; + } + std::vector buffers(num_binding); + for (int i = 0; i < ctx->num_inputs(); i++) { - const string input_name = StrCat(kInputPHName, i); + const string input_name = StrCat(kInputPHName, static_engine_ ? i : input_node_ids_[i]); const int binding_index = cuda_engine->getBindingIndex(input_name.c_str()); if (binding_index == -1) { const string msg = @@ -490,7 +550,7 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx, for (int i = 0; i < ctx->num_outputs(); i++) { // Create an output tensor - const string output_name = StrCat(kOutputPHName, i); + const string output_name = StrCat(kOutputPHName, static_engine_ ? i : output_node_ids_[i]); const int binding_index = cuda_engine->getBindingIndex(output_name.c_str()); Tensor* output_tensor = nullptr; @@ -719,7 +779,7 @@ Status TRTEngineOp::AllocateCalibrationResources( "Unsupported data type encountered in input ", i); } cres->device_buffers_.emplace( - StrCat(kInputPHName, i), + StrCat(kInputPHName, static_engine_ ? i : input_node_ids_[i]), std::pair(device_address, device_tensor->TotalBytes())); } cres->calibrator_.reset( diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc index d859d5f957f..6205254c72a 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc @@ -23,10 +23,14 @@ limitations under the License. #include #include #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/compiler/tf2tensorrt/convert/convert_graph.h" #include "tensorflow/compiler/tf2tensorrt/utils/calibration_resource.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h" #include "tensorflow/core/framework/fake_input.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/kernels/ops_testutil.h" @@ -47,7 +51,6 @@ class TRTEngineOpTestBase : public OpsTestBase { // Create the GPU device. std::unique_ptr device( DeviceFactory::NewDevice("GPU", {}, "/job:worker/replica:0/task:0")); - // Create simple TF graph. Scope s = Scope::NewRootScope(); auto feed = ops::Placeholder(s.WithOpName("TensorRTInputPH_0"), dtype, @@ -58,6 +61,32 @@ class TRTEngineOpTestBase : public OpsTestBase { // Serialize the graph. TRTEngineOp will convert it using dynamic mode. 
GraphDef graph_def; TF_ASSERT_OK(s.ToGraphDef(&graph_def)); + /* + //VLOG(0) << "Beginning TRTEngineOpTest new code"; + */ + const string func_name = "myop_native_segment"; + Graph* graph = s.graph(); + Graph sgraph(graph->flib_def()); + TF_ASSERT_OK(convert::ModifyGraphForFunctionDef( + graph, graph_def, &sgraph)); + TF_ASSERT_OK(convert::RegisterModifiedGraphToFunctionLibrary(&sgraph, graph, + flib_def_->ToProto(), "myop")); + //TF_ASSERT_OK(convert::RegisterSegmentFunctionToFunctionLibrary(graph, graph_def, "myop")); + + //FunctionDefLibrary fdeflib; + //VLOG(0) << "Before converting graph to function def"; + //auto native_segment = fdeflib.add_function(); + + //GraphToFunctionDef(*graph, func_name, native_segment); + //VLOG(0) << "After conversion from graph to func def"; + /*(*native_segment + ->mutable_attr())[FunctionLibraryDefinition::kIntsOnDeviceAttr] + .set_b(true); + */ + + //graph->AddFunctionLibrary(fdeflib); + //VLOG(0) << native_segment->DebugString(); + PartialTensorShape shape({-1, -1}); // Create the op. @@ -67,8 +96,8 @@ class TRTEngineOpTestBase : public OpsTestBase { .Attr("input_shapes", {shape}) .Attr("output_shapes", {shape}) .Attr("static_engine", false) - .Attr("segment_funcdef_name", "") // no native fallback - .Attr("serialized_segment", graph_def.SerializeAsString()) + .Attr("segment_funcdef_name", func_name) // no native fallback + .Attr("serialized_segment", "")//graph_def.SerializeAsString()) .Attr("calibration_data", "") .Attr("max_cached_engines_count", max_cached_engines_count) .Attr("workspace_size_bytes", 1 << 20) diff --git a/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc b/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc new file mode 100644 index 00000000000..38b39804113 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc @@ -0,0 +1,172 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.h" +//#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" +#include "tensorflow/core/common_runtime/graph_optimizer.h" +#include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/graph/algorithm.h" +#include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/platform/logging.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/ascii.h" + +namespace tensorflow { +namespace tensorrt { + +const char* const kInputPHName = "TensorRTInputPH_"; +const char* const kOutputPHName = "TensorRTOutputPH_"; +const char* const kInputPHNameLower = "tensorrtinputph_"; +const char* const kOutputPHNameLower = "tensorrtoutputph_"; + +string NewNameWithIOPrefix(const Node* n) { + if (absl::StartsWith(n->name(), kInputPHNameLower)){ + return strings::StrCat(kInputPHName, n->id()); + } + else if (absl::StartsWith(n->name(), kOutputPHNameLower)) { + return strings::StrCat(kOutputPHName, n->id()); + } + return strings::StrCat("n", n->id()); +} + +void ToGraphDefWithIOPrefix(const Graph* g, GraphDef* gdef) { + // This is the same function as in function.cc. However, it uses the + // NewName mapping above, which retains IO prefixes (kInputPHName etc) + gtl::InlinedVector inputs; + gdef->Clear(); + *gdef->mutable_versions() = g->versions(); + + std::vector start_nodes; + for (Node* n : g->nodes()) { + if (n->out_edges().empty()) { + start_nodes.push_back(n); + } + } + + ReverseDFSFrom(*g, start_nodes, nullptr, [gdef, &inputs](Node* n) { + if (!n->IsOp()) return; + NodeDef* ndef = gdef->add_node(); + ndef->set_name(NewNameWithIOPrefix(n)); + ndef->set_op(n->type_string()); + for (const auto& attr : n->attrs()) { + (*ndef->mutable_attr())[attr.first] = attr.second; + } + + if (!n->assigned_device_name().empty()) { + ndef->set_device(n->assigned_device_name()); + } else { + ndef->set_device(n->requested_device()); + } + + inputs.clear(); + inputs.resize(n->num_inputs()); + for (const Edge* e : n->in_edges()) { + if (e->IsControlEdge()) { + inputs.push_back(e); + } else { + if (inputs[e->dst_input()] == nullptr) { + inputs[e->dst_input()] = e; + } else { + LOG(WARNING) << "Malformed graph node. multiple input edges: " + << n->DebugString(); + } + } + } + // node->name() is merely NodeDef::name, which are not guaranteed + // to be unique and stable after optimization rewrites. Therefore, + // we use "n or " instead. 
+ for (const Edge* e : inputs) { + if (e == nullptr) { + ndef->add_input("unknown"); + continue; + } + const string srcname = NewNameWithIOPrefix(e->src()); + if (!e->src()->IsOp()) { + } else if (e->IsControlEdge()) { + ndef->add_input(strings::StrCat("^", srcname)); + } else if (e->src_output() == 0) { + ndef->add_input(srcname); + } else { + ndef->add_input(strings::StrCat(srcname, ":", e->src_output())); + } + } + }); +} + +Status FunctionDefToGraphDef(FunctionLibraryRuntime::Handle handle, + FunctionLibraryRuntime* flib_runtime, + GraphDef* graph_def, + std::vector* input_node_ids, + std::vector* output_node_ids) { + const FunctionLibraryDefinition* flib_def = flib_runtime->GetFunctionLibraryDefinition(); + const FunctionBody* fbody; + VLOG(0) << "Getting Function Body \n"; + VLOG(0) << "HANDLE" << handle; + fbody = flib_runtime->GetFunctionBody(handle); + //TF_RET_CHECK(*fbody) + std::unique_ptr graph(new Graph(flib_def)); + + CopyGraph(*fbody->graph, graph.get()); + + // Copied from compiler/xla/compile_xla.cc : + /* + OptimizerOptions opts; + opts.set_opt_level(OptimizerOptions::L0); + opts.set_do_common_subexpression_elimination(false); + opts.set_do_function_inlining(true); + opts.set_do_constant_folding(true); + GraphOptimizer optimizer(opts); + auto cf_consider_fn = [](const Node* n) { + for (const auto& output_arg : n->op_def().output_arg()) { + if (output_arg.type() == DT_VARIANT) { + return false; + } + } + return true; + }; + GraphOptimizer::Options graph_optimizer_options; + graph_optimizer_options.cf_consider_fn = cf_consider_fn; + + */ + //optimizer.Optimize(flib_runtime, flib_runtime->env(), + // /*device=*/nullptr, &graph, graph_optimizer_options); + + for (Node* n : graph->nodes()) { + auto id = n->id(); + if (n->IsArg()) { + VLOG(1) << "Arg Node id " << id; + input_node_ids->push_back(id); + } + if (n->IsRetval()) { + VLOG(1) << "Retval Node id " << id; + output_node_ids->push_back(id); + } + } + + ToGraphDefWithIOPrefix(graph.release(), graph_def); + + for (const auto node_def : graph_def->node()) { + string node_name = node_def.name(); + VLOG(0) << "NODENAME AFTER FROM FUNCDEF " << node_name << ", op=" << node_def.op(); + } + VLOG(0) << "Finished converting \n"; + + return Status::OK(); + +} + +} +} diff --git a/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.h b/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.h new file mode 100644 index 00000000000..ffc702679e0 --- /dev/null +++ b/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.h @@ -0,0 +1,42 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_FUNCDEF_TO_GRAPHDEF_H_ +#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_FUNCDEF_TO_GRAPHDEF_H_ + +#include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/framework/function.h" + +#if GOOGLE_CUDA +#if GOOGLE_TENSORRT + +namespace tensorflow { + +namespace tensorrt { + +string NewNameWithIOPrefix(const Node* n); +void ToGraphDefWithIOPrefix(const Graph* g, GraphDef* gdef); +Status FunctionDefToGraphDef(FunctionLibraryRuntime::Handle handle, + FunctionLibraryRuntime* flib_runtime, + GraphDef* graph_def, + std::vector* input_node_ids, + std::vector* output_node_ids); + +} // namespace tensorrt +} // namespace tensorflow + +#endif +#endif +#endif diff --git a/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py b/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py index 6b72cbec9bd..a15657dd640 100644 --- a/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py +++ b/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py @@ -560,19 +560,19 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): num_engines += 1 segment_funcdef_name = node.attr["segment_funcdef_name"].s function_name = node.name + "_native_segment" - if IsQuantizationWithCalibration(run_params): + is_dynamic_engine = not node.attr["static_engine"].b + if IsQuantizationWithCalibration(run_params) or is_dynamic_engine: self.assertNotEmpty(segment_funcdef_name, node.name) self.assertIn(function_name, functions) else: - self.assertEmpty(segment_funcdef_name, node.name) - self.assertNotIn(function_name, functions) + #self.assertEmpty(segment_funcdef_name, node.name) + self.assertTrue(len(node.attr["serialized_segment"].s), node.name) + #self.assertNotIn(function_name, functions) self.assertIn(node.name, expected_engines) - self.assertTrue(len(node.attr["serialized_segment"].s), node.name) self.assertEqual( self._ToBytes(run_params.precision_mode), node.attr["precision_mode"].s, node.name) - is_dynamic_engine = not node.attr["static_engine"].b self.assertEqual(run_params.dynamic_engine, is_dynamic_engine, node.name) self.assertEqual(node.attr["use_calibration"].b, From abd460a8970e5d58350a2b56b54425aa1af4dea2 Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Fri, 12 Jul 2019 14:50:40 -0700 Subject: [PATCH 0114/3053] Added error checking in trt_engine_op.cc --- .../tf2tensorrt/kernels/trt_engine_op.cc | 22 +++---------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index 2b569d177e1..4ac788a6c3c 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -264,22 +264,10 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) context->GetAttr("use_calibration", &use_calibration_)); native_func_ = kInvalidHandle; if (!static_engine_) { - //TODO(phillip-kravtsov) error checking here: how? - VLOG(0) << "Funcdef_name: " << funcdef_name_; - VLOG(0) << "Static Engine? 
" << static_engine_; - Status status = ConstructFunctionHandle(context); - VLOG(0) << "Status: " << status; + OP_REQUIRES_OK(context, ConstructFunctionHandle(context)); FunctionLibraryRuntime* lib = context->function_library(); - VLOG(0) << "Funcdef to graphdef"; - FunctionDefToGraphDef(native_func_, lib, &segment_graph_, - &input_node_ids_, &output_node_ids_); - for (int id : input_node_ids_) { - VLOG(0) << "Input node id: " << id << " from engine " << name(); - } - for (int id : output_node_ids_) { - VLOG(0) << "Output node id: " << id << " from engine " << name(); - } - + OP_REQUIRES_OK(context, FunctionDefToGraphDef(native_func_, lib, &segment_graph_, + &input_node_ids_, &output_node_ids_)); } calibration_mode_ = (use_calibration_ && precision_mode_ == TrtPrecisionMode::INT8 && @@ -500,10 +488,6 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx, // input. const int num_batch = ctx->input(0).shape().dim_size(0); const int num_binding = ctx->num_inputs() + ctx->num_outputs(); - for (int i = 0; i < num_binding; i++) { - auto binding_name = cuda_engine->getBindingName(i); - VLOG(0) << "Binding name for index " << i << " " << binding_name; - } std::vector buffers(num_binding); From 5f01e19d0463f19c59060bfece6b516f23bb8e69 Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Fri, 12 Jul 2019 15:24:18 -0700 Subject: [PATCH 0115/3053] Removed use_function_backup parameter. --- .../tf2tensorrt/convert/convert_graph.cc | 21 ++++-------- .../tf2tensorrt/convert/convert_graph.h | 2 -- .../convert/trt_optimization_pass.cc | 4 --- .../convert/trt_optimization_pass.h | 5 +-- .../tensorrt/test/quantization_mnist_test.py | 3 +- .../test/tf_trt_integration_test_base.py | 7 ++-- .../python/compiler/tensorrt/trt_convert.py | 21 +----------- .../compiler/tensorrt/trt_convert_test.py | 32 +++++++------------ 8 files changed, 22 insertions(+), 73 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index 0c2831df275..3f029161954 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -463,7 +463,6 @@ Status CreateTRTNode(const ConversionParams& params, } NodeDef trt_node; - //TODO(phillip-kravtsov): use_function_backup: fix this Status status = node_builder.Attr("input_shapes", input_shape_protos) .Attr("output_shapes", output_shape_protos) @@ -634,10 +633,9 @@ Status RegisterModifiedGraphToFunctionLibrary(Graph* sgraph, Graph* graph, (*native_segment ->mutable_attr())[FunctionLibraryDefinition::kIntsOnDeviceAttr] .set_b(true); - //TODO(phillip-kravtsov): set this back to 7 - if (VLOG_IS_ON(0)) { - VLOG(0) << engine_name << " Function_Def "; - VLOG(0) << native_segment->DebugString(); + if (VLOG_IS_ON(7)) { + VLOG(7) << engine_name << " Function_Def "; + VLOG(7) << native_segment->DebugString(); } VLOG(1) << "Adding funcdef to graphlib"; TF_RETURN_IF_ERROR(graph->AddFunctionLibrary(fdeflib)); @@ -697,16 +695,9 @@ std::pair GetDeviceAndAllocator(const ConversionParams& params, // Entry function from optimization pass. Status ConvertAfterShapes(const ConversionParams& params) { // Sanity checks. 
- if (params.precision_mode == TrtPrecisionMode::INT8) { - if (params.use_calibration && !params.use_function_backup) { - return errors::InvalidArgument( - "Calibration requires enabling fallback to TF function execution."); - } - } else { - if (params.use_calibration) { - return errors::InvalidArgument( - "Calibration with FP32 or FP16 is not supported."); - } + if (params.precision_mode != TrtPrecisionMode::INT8 && params.use_calibration) { + return errors::InvalidArgument( + "Calibration requires enabling fallback to TF function execution."); } // Convert graphdef to graph. diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h index 74135e56cf4..f7674fb367c 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h @@ -47,8 +47,6 @@ struct ConversionParams { // maximum number of cached engines int max_cached_engines = 1; bool use_calibration = true; - // Whether to use function fallback for TRTEngineOp - bool use_function_backup = true; }; // Method to call from optimization pass diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc index 6af483d37cf..6296851d378 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc @@ -67,9 +67,6 @@ Status TRTOptimizationPass::Init( if (params.count("use_calibration")) { use_calibration_ = params.at("use_calibration").b(); } - if (params.count("use_function_backup")) { - use_function_backup_ = params.at("use_function_backup").b(); - } return Status::OK(); } @@ -259,7 +256,6 @@ Status TRTOptimizationPass::Optimize(grappler::Cluster* cluster, cp.is_dyn_op = is_dynamic_op_; cp.max_cached_engines = max_cached_batches_; cp.use_calibration = use_calibration_; - cp.use_function_backup = use_function_backup_; auto status = ConvertAfterShapes(cp); VLOG(1) << "Returning from " << name_; return status; diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h index d3fd914b302..dbed5354f15 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.h @@ -40,8 +40,7 @@ class TRTOptimizationPass : public grappler::CustomGraphOptimizer { is_dynamic_op_(false), max_cached_batches_(1), max_workspace_size_bytes_(256LL << 20), - use_calibration_(true), - use_function_backup_(true) { + use_calibration_(true) { VLOG(1) << "Constructing " << name_; } @@ -71,8 +70,6 @@ class TRTOptimizationPass : public grappler::CustomGraphOptimizer { int64_t max_workspace_size_bytes_; bool use_calibration_; - // Whether to allow TF function fallback path in TRTEngineOp. - bool use_function_backup_; }; } // namespace convert diff --git a/tensorflow/python/compiler/tensorrt/test/quantization_mnist_test.py b/tensorflow/python/compiler/tensorrt/test/quantization_mnist_test.py index 56994617b90..d44a0ec7156 100644 --- a/tensorflow/python/compiler/tensorrt/test/quantization_mnist_test.py +++ b/tensorflow/python/compiler/tensorrt/test/quantization_mnist_test.py @@ -153,8 +153,7 @@ class QuantizationAwareTrainingMNISTTest(test_util.TensorFlowTestCase): # runtime to allocate GPU memory. 
max_workspace_size_bytes=1 << 28, minimum_segment_size=2, - use_calibration=False, - use_function_backup=False) + use_calibration=False) graph_def = converter.convert() logging.info('Number of nodes after TF-TRT conversion: %d', len(graph_def.node)) diff --git a/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py b/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py index a15657dd640..a41f965573a 100644 --- a/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py +++ b/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py @@ -234,10 +234,8 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): is_dynamic_op=run_params.dynamic_engine, maximum_cached_engines=1, use_calibration=run_params.use_calibration, - use_function_backup=False, max_batch_size=min(batch_list)) - return conversion_params._replace( - use_function_backup=IsQuantizationWithCalibration(conversion_params)) + return conversion_params def ShouldRunTest(self, run_params): """Whether to run the test.""" @@ -388,8 +386,7 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): minimum_segment_size=conversion_params.minimum_segment_size, is_dynamic_op=conversion_params.is_dynamic_op, maximum_cached_engines=conversion_params.maximum_cached_engines, - use_calibration=conversion_params.use_calibration, - use_function_backup=conversion_params.use_function_backup) + use_calibration=conversion_params.use_calibration) def _GetCalibratedInferGraph(self, run_params, saved_model_dir, inputs_data): """Return trt converted graphdef in INT8 mode.""" diff --git a/tensorflow/python/compiler/tensorrt/trt_convert.py b/tensorflow/python/compiler/tensorrt/trt_convert.py index 982c4fea641..58b00be5350 100644 --- a/tensorflow/python/compiler/tensorrt/trt_convert.py +++ b/tensorflow/python/compiler/tensorrt/trt_convert.py @@ -144,11 +144,6 @@ TrtConversionParams = collections.namedtuple( # trained with fake quantization. "use_calibration", - # If set to True, it will create a FunctionDef for each subgraph that is - # converted to TRT op, and if TRT ops fail to execute at runtime, it'll - # invoke that function as a fallback. - "use_function_backup", - # Max size for the input batch. # This option is deprecated in TF 2.0. "max_batch_size", @@ -162,7 +157,6 @@ DEFAULT_TRT_CONVERSION_PARAMS = TrtConversionParams( is_dynamic_op=False, maximum_cached_engines=1, use_calibration=True, - use_function_backup=True, max_batch_size=1) _TRT_ENGINE_CACHE_CONTAINER_NAME = "TF-TRT-Engine-Cache" @@ -269,8 +263,6 @@ def get_tensorrt_rewriter_config( "maximum_cached_engines"].i = conversion_params.maximum_cached_engines optimizer.parameter_map[ "use_calibration"].b = conversion_params.use_calibration - optimizer.parameter_map[ - "use_function_backup"].b = conversion_params.use_function_backup if is_v2: # Static mode (a.k.a pre-generating TRT engines and make them node @@ -328,8 +320,7 @@ class TrtGraphConverter(object): minimum_segment_size=3, is_dynamic_op=False, maximum_cached_engines=1, - use_calibration=True, - use_function_backup=True): + use_calibration=True): """Initialize the converter. Args: @@ -368,9 +359,6 @@ class TrtGraphConverter(object): will occur. Please note that accuracy may be negatively affected if there is a mismatch between which tensors TRT quantizes and which tensors were trained with fake quantization. 
- use_function_backup: if set to True, it will create a FunctionDef for each - subgraph that is converted to TRT op, and if TRT ops fail to execute at - runtime, it'll invoke that function as a fallback. Raises: ValueError: if the combination of the parameters is invalid. @@ -408,12 +396,6 @@ class TrtGraphConverter(object): "dynamic TRT ops only. Disregarding is_dynamic_op parameter.") is_dynamic_op = True - # TODO(laigd): consider provide a mechanism to remove the fallback path - # after calibration is done. - if self._need_calibration and not use_function_backup: - raise ValueError( - "Calibration requires enabling fallback to TF function execution.") - # TODO(laigd): # - Verify in int8 mode that maximum_cached_engines is set properly. # - If it fails to build the int8 engine it should return error. @@ -430,7 +412,6 @@ class TrtGraphConverter(object): is_dynamic_op=is_dynamic_op, maximum_cached_engines=maximum_cached_engines, use_calibration=use_calibration, - use_function_backup=use_function_backup, max_batch_size=max_batch_size) _check_conversion_params(self._conversion_params) diff --git a/tensorflow/python/compiler/tensorrt/trt_convert_test.py b/tensorflow/python/compiler/tensorrt/trt_convert_test.py index 61ecd79beb2..cdd24ce041e 100644 --- a/tensorflow/python/compiler/tensorrt/trt_convert_test.py +++ b/tensorflow/python/compiler/tensorrt/trt_convert_test.py @@ -200,8 +200,7 @@ class TrtConvertTest(test_util.TensorFlowTestCase): max_batch_size=1, minimum_segment_size=3, is_dynamic_op=False, - maximum_cached_engines=1, - use_function_backup=False): + maximum_cached_engines=1): """Helper method to convert a GraphDef or SavedModel using TF-TRT.""" converter = trt_convert.TrtGraphConverter( input_saved_model_dir=input_saved_model_dir, @@ -215,8 +214,7 @@ class TrtConvertTest(test_util.TensorFlowTestCase): else trt_convert.TrtPrecisionMode.FP32), minimum_segment_size=minimum_segment_size, is_dynamic_op=is_dynamic_op, - maximum_cached_engines=maximum_cached_engines, - use_function_backup=use_function_backup) + maximum_cached_engines=maximum_cached_engines) output_graph_def = converter.convert() if need_calibration: @@ -249,8 +247,7 @@ class TrtConvertTest(test_util.TensorFlowTestCase): input_saved_model_dir=input_saved_model_dir, output_saved_model_dir=output_saved_model_dir, need_calibration=need_calibration, - is_dynamic_op=is_dynamic_op, - use_function_backup=need_calibration) + is_dynamic_op=is_dynamic_op) graph_defs_to_verify = [output_graph_def] if output_saved_model_dir: @@ -314,8 +311,7 @@ class TrtConvertTest(test_util.TensorFlowTestCase): conversion_params=trt_convert.DEFAULT_TRT_CONVERSION_PARAMS._replace( precision_mode=trt_convert.TrtPrecisionMode.FP32, is_dynamic_op=True, - maximum_cached_engines=2, - use_function_backup=False)) + maximum_cached_engines=2)) @test_util.run_v2_only def testTrtGraphConverter_BasicConversion_v2(self): @@ -445,7 +441,6 @@ class TrtConvertTest(test_util.TensorFlowTestCase): def _TestRun(self, sess, batch_size, - use_function_backup=False, expect_engine_is_run=True): try: result = sess.run( @@ -454,7 +449,8 @@ class TrtConvertTest(test_util.TensorFlowTestCase): except errors.OpError as e: # This should happen only when fallback path is disabled and TRT engine # fails to run. 
- self.assertTrue(not use_function_backup and not expect_engine_is_run) + # TODO(phillip-kravtsov) Check what correct handling is + #self.assertTrue(not use_function_backup and not expect_engine_is_run) self.assertIn("Fallback path is disabled, for TRTEngineOp_0", str(e)) @test_util.deprecated_graph_mode_only @@ -486,8 +482,7 @@ class TrtConvertTest(test_util.TensorFlowTestCase): input_saved_model_dir=input_saved_model_dir, output_saved_model_dir=output_saved_model_dir, is_dynamic_op=True, - maximum_cached_engines=2, - use_function_backup=False) # Disallow fallback. + maximum_cached_engines=2) # Test the output GraphDef. with ops.Graph().as_default(): @@ -513,7 +508,7 @@ class TrtConvertTest(test_util.TensorFlowTestCase): # the max, it should evict an old engine and create a new one. self._TestRun(sess, 3) - def _TestStaticOp(self, use_function_backup): + def _TestStaticOp(self): if not is_tensorrt_enabled(): return @@ -524,8 +519,7 @@ class TrtConvertTest(test_util.TensorFlowTestCase): output_graph_def = self._ConvertGraph( input_saved_model_dir=input_saved_model_dir, output_saved_model_dir=output_saved_model_dir, - maximum_cached_engines=2, # This is noop, added just for testing. - use_function_backup=use_function_backup) + maximum_cached_engines=2) # Test the output GraphDef. with ops.Graph().as_default(): @@ -536,14 +530,12 @@ class TrtConvertTest(test_util.TensorFlowTestCase): self._TestRun( sess, 1, - use_function_backup=use_function_backup, expect_engine_is_run=True) # Run with batch size 2, which exceed the max_batch_size, it should try # to fall back to TF function. self._TestRun( sess, 2, - use_function_backup=use_function_backup, expect_engine_is_run=False) # Test the output SavedModel @@ -555,23 +547,21 @@ class TrtConvertTest(test_util.TensorFlowTestCase): self._TestRun( sess, 1, - use_function_backup=use_function_backup, expect_engine_is_run=True) # Run with batch size 2, which exceed the max_batch_size, it should try # to fall back to TF function. self._TestRun( sess, 2, - use_function_backup=use_function_backup, expect_engine_is_run=False) @test_util.deprecated_graph_mode_only def testTrtGraphConverter_StaticOp_NoFallback(self): - self._TestStaticOp(use_function_backup=False) + self._TestStaticOp() @test_util.deprecated_graph_mode_only def testTrtGraphConverter_StaticOp_WithFallback(self): - self._TestStaticOp(use_function_backup=True) + self._TestStaticOp() if __name__ == "__main__": From 6263eb5307080e20453cec0d0e7f35fe36a13989 Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Fri, 12 Jul 2019 15:33:32 -0700 Subject: [PATCH 0116/3053] Removed excessively verbose logging from trt. 
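The VLOG(0) statements added while bringing up the funcdef-to-graphdef path printed on every run at the default verbosity: the node-device print in GetEngineInfo, the progress and per-node-name dumps in funcdef_to_graphdef.cc, and the function-library listing in trt_engine_op.cc. This change deletes the purely diagnostic ones and demotes the function-name listing to VLOG(2), so it only shows up when verbose logging is explicitly requested (e.g. via the usual TF_CPP_MIN_VLOG_LEVEL or TF_CPP_VMODULE controls). A minimal sketch of the convention:

    // Before: emitted unconditionally at the default log verbosity.
    VLOG(0) << "Func name: " << func_name;
    // After: emitted only when VLOG level 2 is enabled for this file.
    VLOG(2) << "Func name: " << func_name;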
--- tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc | 1 - tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc | 2 +- tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc | 4 ---- tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc | 4 ---- 4 files changed, 1 insertion(+), 10 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index 3f029161954..112966acb40 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -135,7 +135,6 @@ Status GetEngineInfo(const Graph* g, DeviceNameUtils::ParsedName parsed_name; const bool parse_succeeded = DeviceNameUtils::ParseFullName(node_device, &parsed_name); - VLOG(0) << node_device; if (!parse_succeeded || (parse_succeeded && parsed_name.type == "CPU")) { string msg; if (!parse_succeeded) { diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index 4ac788a6c3c..a329c8c6d78 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -216,7 +216,7 @@ Status TRTEngineOp::ConstructFunctionHandle(OpKernelConstruction* ctx) { } auto func_names = lib->GetFunctionLibraryDefinition()->ListFunctionNames(); for (auto func_name : func_names) { - VLOG(0) << "Func name: " << func_name; + VLOG(2) << "Func name: " << func_name; } auto fdef = lib->GetFunctionLibraryDefinition()->Find(funcdef_name_); if (fdef == nullptr) { diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc index 6205254c72a..dc31e5c156e 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc @@ -62,7 +62,6 @@ class TRTEngineOpTestBase : public OpsTestBase { GraphDef graph_def; TF_ASSERT_OK(s.ToGraphDef(&graph_def)); /* - //VLOG(0) << "Beginning TRTEngineOpTest new code"; */ const string func_name = "myop_native_segment"; Graph* graph = s.graph(); @@ -74,18 +73,15 @@ class TRTEngineOpTestBase : public OpsTestBase { //TF_ASSERT_OK(convert::RegisterSegmentFunctionToFunctionLibrary(graph, graph_def, "myop")); //FunctionDefLibrary fdeflib; - //VLOG(0) << "Before converting graph to function def"; //auto native_segment = fdeflib.add_function(); //GraphToFunctionDef(*graph, func_name, native_segment); - //VLOG(0) << "After conversion from graph to func def"; /*(*native_segment ->mutable_attr())[FunctionLibraryDefinition::kIntsOnDeviceAttr] .set_b(true); */ //graph->AddFunctionLibrary(fdeflib); - //VLOG(0) << native_segment->DebugString(); PartialTensorShape shape({-1, -1}); diff --git a/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc b/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc index 38b39804113..af76d84b232 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc @@ -113,8 +113,6 @@ Status FunctionDefToGraphDef(FunctionLibraryRuntime::Handle handle, std::vector* output_node_ids) { const FunctionLibraryDefinition* flib_def = flib_runtime->GetFunctionLibraryDefinition(); const FunctionBody* fbody; - VLOG(0) << "Getting Function Body \n"; - VLOG(0) << "HANDLE" << handle; fbody = flib_runtime->GetFunctionBody(handle); //TF_RET_CHECK(*fbody) std::unique_ptr graph(new Graph(flib_def)); 
@@ -160,9 +158,7 @@ Status FunctionDefToGraphDef(FunctionLibraryRuntime::Handle handle, for (const auto node_def : graph_def->node()) { string node_name = node_def.name(); - VLOG(0) << "NODENAME AFTER FROM FUNCDEF " << node_name << ", op=" << node_def.op(); } - VLOG(0) << "Finished converting \n"; return Status::OK(); From 27098fb159eb88e84eb47f3bfa4e9ef67316a8bd Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Tue, 16 Jul 2019 11:08:37 -0700 Subject: [PATCH 0117/3053] Mild cleanup. --- tensorflow/compiler/tf2tensorrt/BUILD | 1 - tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD index bca101c4a53..7490f4e8d15 100644 --- a/tensorflow/compiler/tf2tensorrt/BUILD +++ b/tensorflow/compiler/tf2tensorrt/BUILD @@ -253,7 +253,6 @@ tf_cuda_library( ":utils", "//tensorflow/core:framework_headers_lib", "//tensorflow/core:framework_lite", - #"//tensorflow/core:framework", "//tensorflow/core/grappler:op_types", "//tensorflow/core:graph", "//tensorflow/core:gpu_runtime", diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index 112966acb40..6dbd210316b 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -441,7 +441,6 @@ Status CreateTRTNode(const ConversionParams& params, segment_string = string(static_cast(engine_data->data()), engine_data->size()); } else { - //segment_string = info.segment_graph_def.SerializeAsString(); segment_string = ""; } @@ -540,7 +539,8 @@ Status CreateTRTNode(const ConversionParams& params, Status ModifyGraphForFunctionDef(Graph* graph, const GraphDef& segment, Graph* sgraph) { - //Graph sgraph(graph->flib_def()); + // sgraph is a graph for the segment, to be modified by this function + // graph is the input graph to be optimized by TRT. 
GraphConstructorOptions gcopts; TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(gcopts, segment, sgraph)); std::map io_nodes; From 40d5fbe0ad1bf81b278181201006dc92755b8a97 Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Tue, 16 Jul 2019 14:54:06 -0700 Subject: [PATCH 0118/3053] More mild cleanup, removed unnecessary static condition.y --- tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc | 3 +-- tensorflow/compiler/tf2tensorrt/convert/convert_graph.h | 4 ---- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index 6dbd210316b..a1234b56e0a 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -783,8 +783,7 @@ Status ConvertAfterShapes(const ConversionParams& params) { engine_segments.push_back(std::move(curr_engine)); converted_segments.push_back(std::move(curr_segment)); - if (VLOG_IS_ON(8) && - curr_engine.engine_type == EngineInfo::EngineType::TRTStatic) { + if (VLOG_IS_ON(8)) { string fname = engine_segments.back().engine_name; StrAppend(&fname, ".pb"); std::fstream f; diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h index f7674fb367c..25bcb345ce5 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h @@ -56,10 +56,6 @@ Status ConvertAfterShapes(const ConversionParams& params); std::pair GetDeviceAndAllocator(const ConversionParams& params, const EngineInfo& engine); -/*Status RegisterSegmentFunctionToFunctionLibrary(Graph* graph, - const GraphDef& segment, - const string& engine_name); - */ Status ModifyGraphForFunctionDef(Graph* graph, const GraphDef& segment, Graph* sgraph); From bb5ebbec9872d8e11d71bbc22bddc3d7458804ce Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Tue, 16 Jul 2019 20:06:04 -0700 Subject: [PATCH 0119/3053] Moved constant IO strings into class. Renamed method in funcdef_to_graphdef. Formatted, removed commenting. --- .../tf2tensorrt/convert/convert_graph.cc | 43 +++--- .../tf2tensorrt/convert/convert_graph.h | 7 +- .../tf2tensorrt/convert/convert_nodes.cc | 132 +++++++++--------- .../tf2tensorrt/convert/convert_nodes.h | 4 +- .../compiler/tf2tensorrt/convert/utils.h | 8 ++ .../tf2tensorrt/kernels/trt_engine_op.cc | 54 ++++--- .../tf2tensorrt/kernels/trt_engine_op_test.cc | 19 +-- .../tf2tensorrt/utils/funcdef_to_graphdef.cc | 74 ++++------ .../tf2tensorrt/utils/funcdef_to_graphdef.h | 13 +- .../test/tf_trt_integration_test_base.py | 2 - .../compiler/tensorrt/trt_convert_test.py | 2 - 11 files changed, 159 insertions(+), 199 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index a1234b56e0a..74d4da6df73 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -49,9 +49,9 @@ limitations under the License. 
#include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" -#include "tensorflow/core/protobuf/config.pb.h" // NOLINT +#include "tensorflow/core/protobuf/config.pb.h" // NOLINT #include "tensorflow/core/protobuf/device_properties.pb.h" // NOLINT -#include "tensorflow/core/protobuf/rewriter_config.pb.h" // NOLINT +#include "tensorflow/core/protobuf/rewriter_config.pb.h" // NOLINT #include "tensorflow/core/util/device_name_utils.h" #if GOOGLE_CUDA @@ -66,6 +66,8 @@ using absl::StrCat; namespace { +//auto prefixes = IONamePrefixes(); + Status BuildNodeMap(const Graph& graph, std::unordered_map* node_map) { for (auto* node : graph.op_nodes()) { @@ -466,7 +468,8 @@ Status CreateTRTNode(const ConversionParams& params, .Attr("output_shapes", output_shape_protos) .Attr("static_engine", info.engine_type == EngineInfo::EngineType::TRTStatic) - .Attr("segment_funcdef_name", StrCat(info.engine_name, "_native_segment")) + .Attr("segment_funcdef_name", + StrCat(info.engine_name, "_native_segment")) .Attr("serialized_segment", segment_string) .Attr("calibration_data", "") .Attr("max_cached_engines_count", info.maximum_cached_engines) @@ -536,8 +539,7 @@ Status CreateTRTNode(const ConversionParams& params, } // Function to construct a funcdef from the segment and add it to the graph. -Status ModifyGraphForFunctionDef(Graph* graph, - const GraphDef& segment, +Status ModifyGraphForFunctionDef(Graph* graph, const GraphDef& segment, Graph* sgraph) { // sgraph is a graph for the segment, to be modified by this function // graph is the input graph to be optimized by TRT. @@ -546,16 +548,16 @@ Status ModifyGraphForFunctionDef(Graph* graph, std::map io_nodes; int num_inputs = 0; for (auto n : sgraph->op_nodes()) { - if (absl::StartsWith(n->name(), kInputPHName)) { + if (absl::StartsWith(n->name(), prefixes.kInputPHName)) { num_inputs++; io_nodes.insert({n->name(), n}); - } else if (absl::StartsWith(n->name(), kOutputPHName)) { + } else if (absl::StartsWith(n->name(), prefixes.kOutputPHName)) { io_nodes.insert({n->name(), n}); } } for (int i = 0; i < num_inputs; ++i) { - auto name = StrCat(kInputPHName, i); + auto name = StrCat(prefixes.kInputPHName, i); auto node = io_nodes[name]; NodeDef nd; NodeDefBuilder node_builder(StrCat(name, "_Arg"), @@ -582,7 +584,7 @@ Status ModifyGraphForFunctionDef(Graph* graph, } for (int i = 0; i < io_nodes.size() - num_inputs; ++i) { - auto name = StrCat(kOutputPHName, i); + auto name = StrCat(prefixes.kOutputPHName, i); auto node = io_nodes[name]; NodeDef nd; NodeDefBuilder node_builder(StrCat(name, "_Ret"), @@ -694,7 +696,8 @@ std::pair GetDeviceAndAllocator(const ConversionParams& params, // Entry function from optimization pass. Status ConvertAfterShapes(const ConversionParams& params) { // Sanity checks. 
- if (params.precision_mode != TrtPrecisionMode::INT8 && params.use_calibration) { + if (params.precision_mode != TrtPrecisionMode::INT8 && + params.use_calibration) { return errors::InvalidArgument( "Calibration requires enabling fallback to TF function execution."); } @@ -717,9 +720,8 @@ Status ConvertAfterShapes(const ConversionParams& params) { TrtNodeValidator validator(*params.graph_properties, params.precision_mode, params.use_calibration); TF_RETURN_IF_ERROR(segment::SegmentGraph( - &graph, - std::bind(&TrtNodeValidator::IsTensorRTCandidate, &validator, - std::placeholders::_1), + &graph, std::bind(&TrtNodeValidator::IsTensorRTCandidate, &validator, + std::placeholders::_1), // Input validation is already done by TrtNodeValidator, so we don't // need to check the input edges. [](const Edge* edge) { return true; }, OutputEdgeValidator(), @@ -757,23 +759,22 @@ Status ConvertAfterShapes(const ConversionParams& params) { : EngineInfo::EngineType::TRTStatic); curr_engine.use_calibration = params.use_calibration; curr_engine.maximum_cached_engines = params.max_cached_engines; - Graph sgraph(flib); status = ModifyGraphForFunctionDef(&graph, curr_engine.segment_graph_def, &sgraph); if (!status.ok()) { - LOG(WARNING) << "Failed to modify graph as a function " - << t << ": " << status; + LOG(WARNING) << "Failed to modify graph as a function " << t << ": " + << status; continue; } FunctionDefLibrary fdeflib; - status = RegisterModifiedGraphToFunctionLibrary(&sgraph, &graph, - fdeflib, curr_engine.engine_name); - + status = RegisterModifiedGraphToFunctionLibrary(&sgraph, &graph, fdeflib, + curr_engine.engine_name); + if (!status.ok()) { - LOG(WARNING) << "Failed to register segment graphdef as a function " - << t << ": " << status; + LOG(WARNING) << "Failed to register segment graphdef as a function " << t + << ": " << status; continue; } diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h index 25bcb345ce5..b4f3849a93a 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h @@ -18,8 +18,8 @@ limitations under the License. 
#include #include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" -#include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/grappler/clusters/cluster.h" #include "tensorflow/core/grappler/costs/graph_properties.h" #include "tensorflow/core/lib/core/status.h" @@ -32,6 +32,8 @@ namespace tensorflow { namespace tensorrt { namespace convert { +// extern const IONamePrefixes prefixes; + struct ConversionParams { const GraphDef* input_graph_def = nullptr; const std::vector* output_names = nullptr; @@ -56,8 +58,7 @@ Status ConvertAfterShapes(const ConversionParams& params); std::pair GetDeviceAndAllocator(const ConversionParams& params, const EngineInfo& engine); -Status ModifyGraphForFunctionDef(Graph* graph, - const GraphDef& segment, +Status ModifyGraphForFunctionDef(Graph* graph, const GraphDef& segment, Graph* sgraph); Status RegisterModifiedGraphToFunctionLibrary(Graph* sgraph, Graph* graph, diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index efb186c4c55..784b29470f6 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -77,18 +77,15 @@ limitations under the License. namespace tensorflow { namespace tensorrt { -// TODO(aaroey): put these constants into some class. -const char* const kInputPHName = "TensorRTInputPH_"; -const char* const kOutputPHName = "TensorRTOutputPH_"; +namespace convert { bool IsEngineInput(absl::string_view name) { - return absl::StartsWith(name, kInputPHName); + return absl::StartsWith(name, prefixes.kInputPHName); } bool IsEngineOutput(absl::string_view name) { - return absl::StartsWith(name, kOutputPHName); + return absl::StartsWith(name, prefixes.kOutputPHName); } -namespace convert { using absl::StrAppend; using absl::StrCat; @@ -364,9 +361,9 @@ string DebugString(const nvinfer1::Permutation& permutation, int len) { string DebugString(const nvinfer1::ITensor& tensor) { return StrCat("nvinfer1::ITensor(@", reinterpret_cast(&tensor), - ", name=", tensor.getName(), - ", dtype=", DebugString(tensor.getType()), - ", dims=", DebugString(tensor.getDimensions()), ")"); + ", name=", tensor.getName(), ", dtype=", + DebugString(tensor.getType()), ", dims=", + DebugString(tensor.getDimensions()), ")"); } Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l, @@ -444,11 +441,10 @@ Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l, for (int i = 0; i < broadcast_num_dims; ++i) { if ((output_l[i] != output_r[i]) && (output_l[i] != 1) && (output_r[i] != 1)) { - return errors::InvalidArgument("Infeasible broadcast scheme (", - "batch_dim: ", output_l[0], ", ", - DebugString(*operand_l_new_dims), " vs ", - "batch_dim: ", output_r[0], ", ", - DebugString(*operand_r_new_dims), ")"); + return errors::InvalidArgument( + "Infeasible broadcast scheme (", "batch_dim: ", output_l[0], ", ", + DebugString(*operand_l_new_dims), " vs ", "batch_dim: ", + output_r[0], ", ", DebugString(*operand_r_new_dims), ")"); } } } @@ -716,8 +712,8 @@ size_t TRT_ShapedWeights::size_bytes() const { string TRT_ShapedWeights::DebugString() const { return StrCat("TRT_ShapedWeights(shape=", convert::DebugString(shape_), - ", type=", convert::DebugString(type_), - ", values=", reinterpret_cast(GetValues()), ")"); + ", type=", convert::DebugString(type_), ", values=", + reinterpret_cast(GetValues()), 
")"); } // A fake ITensor implementation used to check whether the TF-TRT converter can @@ -986,10 +982,8 @@ OpConverterParams::OpConverterParams( use_calibration(converter->use_calibration()) {} const std::set* TrtNodeValidator::quantize_ops = new std::set{ - "QuantizeAndDequantizeV2", - "QuantizeAndDequantizeV3", - "FakeQuantWithMinMaxVars", - "FakeQuantWithMinMaxArgs", + "QuantizeAndDequantizeV2", "QuantizeAndDequantizeV3", + "FakeQuantWithMinMaxVars", "FakeQuantWithMinMaxArgs", }; TrtNodeValidator::TrtNodeValidator( @@ -1068,9 +1062,9 @@ Status TrtNodeValidator::IsTensorRTCandidate(const Node* node) { Status status = ConvertToTensorOrWeights(src_def, edge->src_output(), &tensor_or_weights); if (!status.ok()) { - return errors::Internal( - "Failed to convert input ", src_def.name(), - " to a TRT_TensorOrWeights: ", status.error_message()); + return errors::Internal("Failed to convert input ", src_def.name(), + " to a TRT_TensorOrWeights: ", + status.error_message()); } inputs.push_back(tensor_or_weights); } @@ -1369,9 +1363,9 @@ Status Converter::PrepareTensorForShape(const TRT_TensorOrWeights& input, // CreateConstantLayer. So we can treat it as a tensor for // AreDimsStaticWithDifferentSize(). This really only matters for 0-D tensors. if (AreDimsStaticWithDifferentSize(input_dims, dims, /*is_tensor=*/true)) { - return errors::InvalidArgument( - "Incompatible shapes: ", DebugString(input_dims), " vs. ", - DebugString(dims)); + return errors::InvalidArgument("Incompatible shapes: ", + DebugString(input_dims), " vs. ", + DebugString(dims)); } // ConstantLayer requires static shapes (cannot infer -1). if (input.is_weights() && !HasStaticShape(dims)) { @@ -1461,7 +1455,7 @@ void Converter::MaybeApplyQuantizationRanges() { // Infer ranges across marked ops. PropagateQuantizationRanges(); - // Apply ranges. +// Apply ranges. #if IS_TRT_VERSION_GE(5, 0, 0, 0) for (auto pair : quantization_ranges_) { nvinfer1::ITensor* tensor = pair.first; @@ -1516,19 +1510,15 @@ void Converter::MaybeApplyQuantizationRanges() { const std::vector>> fused_patterns = { {"Fused Conv+Bias+Activation", { - IsConvolution, - IsScale, - IsClipOrRelu, + IsConvolution, IsScale, IsClipOrRelu, }}, {"Fused Conv+Bias", { - IsConvolution, - IsScale, + IsConvolution, IsScale, }}, {"Fused Conv+Activation", { - IsConvolution, - IsClipOrRelu, + IsConvolution, IsClipOrRelu, }}, }; for (int i = 0; i < this->network()->getNbLayers(); i++) { @@ -2108,11 +2098,11 @@ Status ConvertReshape(OpConverterParams* params) { << "\nreshape_batch_dim=" << reshape_batch_dim << ", reshape_dims=" << DebugString(reshape_dims); if (reshape_may_change_batch_dim) { - const string msg = StrCat( - "Reshape on batch dimension is not supported, at ", node_def.name(), - ". input_batch_dim=", input_batch_dim, ", ", DebugString(input_dims), - "; reshape_batch_dim=", reshape_batch_dim, ", ", - DebugString(reshape_dims)); + const string msg = + StrCat("Reshape on batch dimension is not supported, at ", + node_def.name(), ". input_batch_dim=", input_batch_dim, ", ", + DebugString(input_dims), "; reshape_batch_dim=", + reshape_batch_dim, ", ", DebugString(reshape_dims)); return errors::Unimplemented(msg); } @@ -2820,7 +2810,7 @@ Status ConvertActivation(OpConverterParams* params) { params->converter->network()->addActivation(*inputs.at(0).tensor(), op_pair->second); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - // Set parameters. +// Set parameters. 
#if IS_TRT_VERSION_GE(5, 1, 2, 0) if (node_def.op() == "Elu") { layer->setAlpha(1.0f); @@ -4111,8 +4101,8 @@ Status ConvertGather(OpConverterParams* params) { if (trt_gather_output_dims.nbDims != expected_trt_output_rank) { return errors::Internal( "Get unexpected output dimensions of IGatherLayer. Expect nbDims: ", - expected_trt_output_rank, - ", actual nbDims: ", trt_gather_output_dims.nbDims); + expected_trt_output_rank, ", actual nbDims: ", + trt_gather_output_dims.nbDims); } // Reshape the output so after adding the implicit batch dim it'll match the // output shape of TF GatherV2. @@ -4211,8 +4201,9 @@ Status ConvertMatMulHelper(OpConverterParams* params, input_b.GetTrtDims().nbDims == 2; // If int8 is specified, FC must be used unless it is not compatible, as MM // does not support int8 at this time. - if (should_use_fc || (can_use_fc && params->converter->precision_mode() == - TrtPrecisionMode::INT8)) { + if (should_use_fc || + (can_use_fc && + params->converter->precision_mode() == TrtPrecisionMode::INT8)) { return ConvertFullyConnectedHelper( params, input_a.tensor(), input_b.weights(), transpose_b, node_name); } @@ -4228,9 +4219,8 @@ Status ConvertMatMulHelper(OpConverterParams* params, // If the MatMul operand is a constant, applies transposes at conversion-time // as necessary. If the operand is a tensor, does nothing. If required // transposes were applied, sets transpose to false. - const auto prepare_matmul_operand = - [¶ms](TRT_TensorOrWeights operand, - bool* transpose) -> nvinfer1::ITensor* { + const auto prepare_matmul_operand = [¶ms]( + TRT_TensorOrWeights operand, bool* transpose) -> nvinfer1::ITensor* { if (operand.is_tensor()) { return operand.tensor(); } else { @@ -4312,19 +4302,18 @@ Status ConvertBatchMatMul(OpConverterParams* params) { // Tensor with TF Dims: 12 5 3 -> TRT Dims: 5 3 // Weight with TF Dims: 12 3 6 -> TRT Dims: 12 3 6 // It is not possible to treat the weight input as a batched [3, 6] tensor. - const auto check_weight_is_not_batched = - [](const TRT_TensorOrWeights& input_l, - const TRT_TensorOrWeights& input_r) { - // If input_l is a weight, then input_r must be a tensor because - // otherwise the op would be handled by Grappler. - if (input_l.is_weights() && - input_l.GetTrtDims().nbDims > input_r.GetTrtDims().nbDims && - input_l.GetTrtDims().d[0] != 1) { - return errors::Unimplemented( - "TensorRT does not support batched constants."); - } - return Status::OK(); - }; + const auto check_weight_is_not_batched = []( + const TRT_TensorOrWeights& input_l, const TRT_TensorOrWeights& input_r) { + // If input_l is a weight, then input_r must be a tensor because + // otherwise the op would be handled by Grappler. 
+ if (input_l.is_weights() && + input_l.GetTrtDims().nbDims > input_r.GetTrtDims().nbDims && + input_l.GetTrtDims().d[0] != 1) { + return errors::Unimplemented( + "TensorRT does not support batched constants."); + } + return Status::OK(); + }; TF_RETURN_IF_ERROR(check_weight_is_not_batched(inputs.at(0), inputs.at(1))); TF_RETURN_IF_ERROR(check_weight_is_not_batched(inputs.at(1), inputs.at(0))); @@ -5017,12 +5006,12 @@ Status ConvertGraphDefToEngine( for (const auto& node_def : gdef.node()) { string node_name = node_def.name(); VLOG(2) << "Converting op name=" << node_name << ", op=" << node_def.op(); - if (IsEngineInput(node_name)){ + if (IsEngineInput(node_name)) { int32 slot_number = -1; string type_key; if (node_def.op() == "Placeholder") { if (!strings::safe_strto32( // non-absl ok - node_name.c_str() + strlen(kInputPHName), &slot_number)) { + node_name.c_str() + strlen(prefixes.kInputPHName), &slot_number)) { return errors::InvalidArgument("Failed to parse slot number from ", node_name); } @@ -5033,7 +5022,11 @@ Status ConvertGraphDefToEngine( slot_number = node_def.attr().at("index").i(); type_key = "T"; } else { - return errors::InvalidArgument("Node ", node_name, " with name starting with kInputPHName is neither Placeholder nor Arg, instead ", node_def.op()); + return errors::InvalidArgument("Node ", node_name, + " with name starting with kInputPHName " + "is neither Placeholder nor Arg, " + "instead ", + node_def.op()); } nvinfer1::DataType trt_dtype; nvinfer1::Dims trt_dims; @@ -5060,14 +5053,17 @@ Status ConvertGraphDefToEngine( int32 slot_number = -1; if (node_def.op() == "Identity") { if (!strings::safe_strto32( // non-absl ok - node_name.c_str() + strlen(kOutputPHName), &slot_number)) { + node_name.c_str() + strlen(prefixes.kOutputPHName), &slot_number)) { return errors::InvalidArgument("Failed to parse slot number from ", node_name); } } else if (tensorflow::grappler::IsRetval(node_def)) { slot_number = node_def.attr().at("index").i(); } else { - return errors::InvalidArgument("Node with name ", node_name, " starting with kOutputPHName is neither Identity nor Retval, instead ", node_def.op()); + return errors::InvalidArgument("Node with name ", node_name, + " starting with prefixes.kOutputPHName is " + "neither Identity nor Retval, instead ", + node_def.op()); } // Get output type that TensorFlow expects TFAttrs attrs(node_def); @@ -5136,7 +5132,7 @@ Status ConvertSegmentToGraphDef( // Add dummy input/output nodes to the segment graphdef. 
if (connection.is_input_edge) { - const string node_name = StrCat(kInputPHName, connection.port_number); + const string node_name = StrCat(prefixes.kInputPHName, connection.port_number); if (marker_nodes.count(node_name)) { VLOG(1) << "Reusing input " << node_name << " for the edge " << connection.outside_node_name << ":" @@ -5155,7 +5151,7 @@ Status ConvertSegmentToGraphDef( << " -> " << connection.inside_node_name << ":" << connection.inside_port; } else { - const string node_name = StrCat(kOutputPHName, connection.port_number); + const string node_name = StrCat(prefixes.kOutputPHName, connection.port_number); if (marker_nodes.count(node_name)) { VLOG(1) << "Reusing output " << node_name << " for the edge " << connection.inside_node_name << ":" << connection.inside_port @@ -5194,7 +5190,7 @@ Status ConvertSegmentToGraphDef( auto snode = segment_def->mutable_node(old_to_new_id_map[connection.inside_id]); const string placeholder_name = - StrCat(kInputPHName, connection.port_number); + StrCat(prefixes.kInputPHName, connection.port_number); VLOG(1) << "Updating " << snode->name() << ":" << connection.inside_port << " from " << snode->input(connection.inside_port) << " to " << placeholder_name; diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h index a6a7afe121e..9dfe8ed3b1d 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h @@ -38,8 +38,6 @@ limitations under the License. namespace tensorflow { namespace tensorrt { -extern const char* const kInputPHName; -extern const char* const kOutputPHName; namespace convert { @@ -51,6 +49,8 @@ namespace convert { (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR == minor && \ NV_TENSORRT_PATCH == patch && NV_TENSORRT_BUILD >= build)) +extern const IONamePrefixes prefixes = IONamePrefixes(); + struct EngineConnection { // Constructs a non-control edge. EngineConnection(const string& outside, int out_id, int out_port, diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.h b/tensorflow/compiler/tf2tensorrt/convert/utils.h index 91c8c660f85..981c182311b 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/utils.h +++ b/tensorflow/compiler/tf2tensorrt/convert/utils.h @@ -23,6 +23,14 @@ limitations under the License. namespace tensorflow { namespace tensorrt { +class IONamePrefixes { + public: + static constexpr const char* const kInputPHName = "TensorRTInputPH_"; + static constexpr const char* const kOutputPHName = "TensorRTOutputPH_"; + static constexpr const char* const kInputPHNameLower = "tensorrtinputph_"; + static constexpr const char* const kOutputPHNameLower = "tensorrtoutputph_"; +}; + template struct TrtDestroyer { void operator()(T* t) { diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index a329c8c6d78..7dc7931f15b 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -22,10 +22,10 @@ limitations under the License. 
#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/calibration_resource.h" +#include "tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h" -#include "tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/op.h" @@ -55,6 +55,9 @@ using ::stream_executor::port::StatusOr; // A helper class to call done() when destructed for asynchronous execution. // Helps simultaneous execution of native and TRT engines. + +auto prefixes = IONamePrefixes(); + class AsyncHelper : public core::RefCounted { public: AsyncHelper(AsyncOpKernel::DoneCallback done) : done_(done) {} @@ -239,16 +242,7 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) OP_REQUIRES_OK(context, context->GetAttr("workspace_size_bytes", &workspace_size_)); OP_REQUIRES_OK(context, context->GetAttr("static_engine", &static_engine_)); - /*if (!static_engine_) { - OP_REQUIRES(context, segment_graph_.ParseFromString(serialized_segment_), - errors::InvalidArgument("Failed to parse segment graphdef!")); - VLOG(1) << "Size of serialized GraphDef: " - << serialized_segment_.capacity(); - string tmp; - // Swap with temporary empty string to deallocate the CPU memory. - serialized_segment_.swap(tmp); - }*/ - + VLOG(1) << "Constructing " << name(); string precision_string; OP_REQUIRES_OK(context, @@ -266,8 +260,9 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) if (!static_engine_) { OP_REQUIRES_OK(context, ConstructFunctionHandle(context)); FunctionLibraryRuntime* lib = context->function_library(); - OP_REQUIRES_OK(context, FunctionDefToGraphDef(native_func_, lib, &segment_graph_, - &input_node_ids_, &output_node_ids_)); + OP_REQUIRES_OK(context, + FunctionDefToGraphDef(native_func_, lib, &segment_graph_, + &input_node_ids_, &output_node_ids_)); } calibration_mode_ = (use_calibration_ && precision_mode_ == TrtPrecisionMode::INT8 && @@ -325,13 +320,12 @@ void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx, core::ScopedUnref unref_cache_res(cache_res); TRTCalibrationResource* calib_res = nullptr; OP_REQUIRES_OK_ASYNC( - ctx, - ctx->resource_manager()->LookupOrCreate( - std::string(kCalibrationContainerName), name(), - reinterpret_cast(&calib_res), - {[ctx, cache_res, this](TRTCalibrationResource** cr) -> Status { - return this->AllocateCalibrationResources(ctx, cache_res, cr); - }}), + ctx, ctx->resource_manager()->LookupOrCreate( + std::string(kCalibrationContainerName), name(), + reinterpret_cast(&calib_res), + {[ctx, this](TRTCalibrationResource** cr) -> Status { + return this->AllocateCalibrationResources(ctx, cr); + }}), *helper); core::ScopedUnref calib_sc(calib_res); int num_inputs = ctx->num_inputs(); @@ -349,9 +343,9 @@ void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx, const auto device_tensor = calib_res->device_tensors_.at(i).AccessTensor(ctx); CHECK_EQ(t.TotalBytes(), device_tensor->TotalBytes()); - input_data.emplace(StrCat(kInputPHName, - static_engine_ ? i : input_node_ids_[i]), - data_address); + input_data.emplace( + StrCat(prefixes.kInputPHName, static_engine_ ? 
i : input_node_ids_[i]), + data_address); } VLOG(2) << "Filled map for sending"; // copied from cuda_kernel_helper since it seems only valid in *.cu.cc files @@ -430,9 +424,9 @@ Status TRTEngineOp::GetEngineInputShapes( // This should not happen, but just for safety. if (actual_input_shapes.size() != cached_input_shapes.size()) { return errors::InvalidArgument( - "Input shape list size mismatch for ", name(), - ", cached size: ", cached_input_shapes.size(), - " vs. actual size: ", actual_input_shapes.size()); + "Input shape list size mismatch for ", name(), ", cached size: ", + cached_input_shapes.size(), " vs. actual size: ", + actual_input_shapes.size()); } if (match_shapes(actual_input_shapes, cached_input_shapes)) { const int cached_batch_size = cached_input_shapes[0].dim_size(0); @@ -492,7 +486,8 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx, std::vector buffers(num_binding); for (int i = 0; i < ctx->num_inputs(); i++) { - const string input_name = StrCat(kInputPHName, static_engine_ ? i : input_node_ids_[i]); + const string input_name = + StrCat(prefixes.kInputPHName, static_engine_ ? i : input_node_ids_[i]); const int binding_index = cuda_engine->getBindingIndex(input_name.c_str()); if (binding_index == -1) { const string msg = @@ -534,7 +529,8 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx, for (int i = 0; i < ctx->num_outputs(); i++) { // Create an output tensor - const string output_name = StrCat(kOutputPHName, static_engine_ ? i : output_node_ids_[i]); + const string output_name = StrCat(prefixes.kOutputPHName, + static_engine_ ? i : output_node_ids_[i]); const int binding_index = cuda_engine->getBindingIndex(output_name.c_str()); Tensor* output_tensor = nullptr; @@ -763,7 +759,7 @@ Status TRTEngineOp::AllocateCalibrationResources( "Unsupported data type encountered in input ", i); } cres->device_buffers_.emplace( - StrCat(kInputPHName, static_engine_ ? i : input_node_ids_[i]), + StrCat(prefixes.kInputPHName, static_engine_ ? i : input_node_ids_[i]), std::pair(device_address, device_tensor->TotalBytes())); } cres->calibrator_.reset( diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc index dc31e5c156e..4eef454f8f3 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc @@ -61,8 +61,6 @@ class TRTEngineOpTestBase : public OpsTestBase { // Serialize the graph. TRTEngineOp will convert it using dynamic mode. GraphDef graph_def; TF_ASSERT_OK(s.ToGraphDef(&graph_def)); - /* - */ const string func_name = "myop_native_segment"; Graph* graph = s.graph(); Graph sgraph(graph->flib_def()); @@ -70,30 +68,17 @@ class TRTEngineOpTestBase : public OpsTestBase { graph, graph_def, &sgraph)); TF_ASSERT_OK(convert::RegisterModifiedGraphToFunctionLibrary(&sgraph, graph, flib_def_->ToProto(), "myop")); - //TF_ASSERT_OK(convert::RegisterSegmentFunctionToFunctionLibrary(graph, graph_def, "myop")); - - //FunctionDefLibrary fdeflib; - //auto native_segment = fdeflib.add_function(); - - //GraphToFunctionDef(*graph, func_name, native_segment); - /*(*native_segment - ->mutable_attr())[FunctionLibraryDefinition::kIntsOnDeviceAttr] - .set_b(true); - */ - - //graph->AddFunctionLibrary(fdeflib); PartialTensorShape shape({-1, -1}); - // Create the op. 
OpsTestBase::SetDevice(DEVICE_GPU, std::move(device)); TF_ASSERT_OK(NodeDefBuilder("myop", "TRTEngineOp") .Input(FakeInput(1, dtype)) .Attr("input_shapes", {shape}) .Attr("output_shapes", {shape}) .Attr("static_engine", false) - .Attr("segment_funcdef_name", func_name) // no native fallback - .Attr("serialized_segment", "")//graph_def.SerializeAsString()) + .Attr("segment_funcdef_name", func_name) + .Attr("serialized_segment", "") .Attr("calibration_data", "") .Attr("max_cached_engines_count", max_cached_engines_count) .Attr("workspace_size_bytes", 1 << 20) diff --git a/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc b/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc index af76d84b232..13457ba5fd2 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc @@ -14,37 +14,32 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.h" -//#include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" -#include "tensorflow/core/common_runtime/graph_optimizer.h" +#include "absl/strings/ascii.h" +#include "absl/strings/str_cat.h" #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/common_runtime/graph_optimizer.h" #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/graph/algorithm.h" #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/platform/logging.h" -#include "absl/strings/str_cat.h" -#include "absl/strings/ascii.h" namespace tensorflow { namespace tensorrt { -const char* const kInputPHName = "TensorRTInputPH_"; -const char* const kOutputPHName = "TensorRTOutputPH_"; -const char* const kInputPHNameLower = "tensorrtinputph_"; -const char* const kOutputPHNameLower = "tensorrtoutputph_"; +auto prefixes = IONamePrefixes(); -string NewNameWithIOPrefix(const Node* n) { - if (absl::StartsWith(n->name(), kInputPHNameLower)){ - return strings::StrCat(kInputPHName, n->id()); - } - else if (absl::StartsWith(n->name(), kOutputPHNameLower)) { - return strings::StrCat(kOutputPHName, n->id()); +string AppendIdToNodeName(const Node* n) { + if (absl::StartsWith(n->name(), prefixes.kInputPHNameLower)) { + return strings::StrCat(prefixes.kInputPHName, n->id()); + } else if (absl::StartsWith(n->name(), prefixes.kOutputPHNameLower)) { + return strings::StrCat(prefixes.kOutputPHName, n->id()); } return strings::StrCat("n", n->id()); } void ToGraphDefWithIOPrefix(const Graph* g, GraphDef* gdef) { // This is the same function as in function.cc. 
However, it uses the - // NewName mapping above, which retains IO prefixes (kInputPHName etc) + // name mapping above, which retains IO prefixes (prefixes.kInputPHName etc) gtl::InlinedVector inputs; gdef->Clear(); *gdef->mutable_versions() = g->versions(); @@ -59,7 +54,7 @@ void ToGraphDefWithIOPrefix(const Graph* g, GraphDef* gdef) { ReverseDFSFrom(*g, start_nodes, nullptr, [gdef, &inputs](Node* n) { if (!n->IsOp()) return; NodeDef* ndef = gdef->add_node(); - ndef->set_name(NewNameWithIOPrefix(n)); + ndef->set_name(AppendIdToNodeName(n)); ndef->set_op(n->type_string()); for (const auto& attr : n->attrs()) { (*ndef->mutable_attr())[attr.first] = attr.second; @@ -93,7 +88,7 @@ void ToGraphDefWithIOPrefix(const Graph* g, GraphDef* gdef) { ndef->add_input("unknown"); continue; } - const string srcname = NewNameWithIOPrefix(e->src()); + const string srcname = AppendIdToNodeName(e->src()); if (!e->src()->IsOp()) { } else if (e->IsControlEdge()) { ndef->add_input(strings::StrCat("^", srcname)); @@ -108,52 +103,33 @@ void ToGraphDefWithIOPrefix(const Graph* g, GraphDef* gdef) { Status FunctionDefToGraphDef(FunctionLibraryRuntime::Handle handle, FunctionLibraryRuntime* flib_runtime, - GraphDef* graph_def, + GraphDef* graph_def, std::vector* input_node_ids, std::vector* output_node_ids) { - const FunctionLibraryDefinition* flib_def = flib_runtime->GetFunctionLibraryDefinition(); + const FunctionLibraryDefinition* flib_def = + flib_runtime->GetFunctionLibraryDefinition(); const FunctionBody* fbody; fbody = flib_runtime->GetFunctionBody(handle); - //TF_RET_CHECK(*fbody) + if (!fbody) { + return errors::Internal( + "Function body is null when converting from FuncDef to GraphDef."); + } std::unique_ptr graph(new Graph(flib_def)); - + CopyGraph(*fbody->graph, graph.get()); - // Copied from compiler/xla/compile_xla.cc : - /* - OptimizerOptions opts; - opts.set_opt_level(OptimizerOptions::L0); - opts.set_do_common_subexpression_elimination(false); - opts.set_do_function_inlining(true); - opts.set_do_constant_folding(true); - GraphOptimizer optimizer(opts); - auto cf_consider_fn = [](const Node* n) { - for (const auto& output_arg : n->op_def().output_arg()) { - if (output_arg.type() == DT_VARIANT) { - return false; - } - } - return true; - }; - GraphOptimizer::Options graph_optimizer_options; - graph_optimizer_options.cf_consider_fn = cf_consider_fn; - - */ - //optimizer.Optimize(flib_runtime, flib_runtime->env(), - // /*device=*/nullptr, &graph, graph_optimizer_options); - for (Node* n : graph->nodes()) { auto id = n->id(); if (n->IsArg()) { - VLOG(1) << "Arg Node id " << id; + VLOG(2) << "Arg Node id used for unique naming is " << id; input_node_ids->push_back(id); } if (n->IsRetval()) { - VLOG(1) << "Retval Node id " << id; + VLOG(2) << "Retval Node id used for unique naming is " << id; output_node_ids->push_back(id); } } - + ToGraphDefWithIOPrefix(graph.release(), graph_def); for (const auto node_def : graph_def->node()) { @@ -161,8 +137,6 @@ Status FunctionDefToGraphDef(FunctionLibraryRuntime::Handle handle, } return Status::OK(); - -} - +} } } diff --git a/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.h b/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.h index ffc702679e0..6acc21242a1 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.h +++ b/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.h @@ -16,6 +16,7 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_FUNCDEF_TO_GRAPHDEF_H_ #define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_FUNCDEF_TO_GRAPHDEF_H_ +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/framework/function.h" @@ -26,16 +27,18 @@ namespace tensorflow { namespace tensorrt { -string NewNameWithIOPrefix(const Node* n); +string AppendIdToNodeName(const Node* n); + void ToGraphDefWithIOPrefix(const Graph* g, GraphDef* gdef); + Status FunctionDefToGraphDef(FunctionLibraryRuntime::Handle handle, FunctionLibraryRuntime* flib_runtime, GraphDef* graph_def, - std::vector* input_node_ids, - std::vector* output_node_ids); + std::vector* input_node_ids, + std::vector* output_node_ids); -} // namespace tensorrt -} // namespace tensorflow +} // namespace tensorrt +} // namespace tensorflow #endif #endif diff --git a/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py b/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py index a41f965573a..6627c3788a4 100644 --- a/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py +++ b/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py @@ -562,9 +562,7 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): self.assertNotEmpty(segment_funcdef_name, node.name) self.assertIn(function_name, functions) else: - #self.assertEmpty(segment_funcdef_name, node.name) self.assertTrue(len(node.attr["serialized_segment"].s), node.name) - #self.assertNotIn(function_name, functions) self.assertIn(node.name, expected_engines) self.assertEqual( self._ToBytes(run_params.precision_mode), diff --git a/tensorflow/python/compiler/tensorrt/trt_convert_test.py b/tensorflow/python/compiler/tensorrt/trt_convert_test.py index cdd24ce041e..b8376a5ca65 100644 --- a/tensorflow/python/compiler/tensorrt/trt_convert_test.py +++ b/tensorflow/python/compiler/tensorrt/trt_convert_test.py @@ -449,8 +449,6 @@ class TrtConvertTest(test_util.TensorFlowTestCase): except errors.OpError as e: # This should happen only when fallback path is disabled and TRT engine # fails to run. - # TODO(phillip-kravtsov) Check what correct handling is - #self.assertTrue(not use_function_backup and not expect_engine_is_run) self.assertIn("Fallback path is disabled, for TRTEngineOp_0", str(e)) @test_util.deprecated_graph_mode_only From fa1e3924c6841409790015106a04ad73c0c1f6cd Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Tue, 16 Jul 2019 20:45:50 -0700 Subject: [PATCH 0120/3053] Removed duplicate function in trt_engine_op.cc --- .../tf2tensorrt/kernels/trt_engine_op.cc | 41 +++++-------------- 1 file changed, 11 insertions(+), 30 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index 7dc7931f15b..c28436a7fea 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -95,9 +95,9 @@ class TRTEngineOp : public AsyncOpKernel { // Construct a function handle for executing native funcdef graph // These are the exact same function. - Status ConstructFunctionHandle(OpKernelContext* ctx); - Status ConstructFunctionHandle(OpKernelConstruction* ctx); + Status ConstructFunctionHandle(FunctionLibraryRuntime* lib, + const string& device_name); // Execute replaced native segment as function Op. 
void ExecuteNativeSegment(OpKernelContext* ctx, AsyncHelper* helper); @@ -192,9 +192,10 @@ void* GetTensorAddress(const Tensor* tensor_ptr) { } } -Status TRTEngineOp::ConstructFunctionHandle(OpKernelContext* ctx) { +Status TRTEngineOp::ConstructFunctionHandle(FunctionLibraryRuntime* lib, + const string& device_name) { VLOG(1) << "Constructing function handle"; - auto lib = ctx->function_library(); + // auto lib = ctx->function_library(); if (lib == nullptr) { return errors::Internal("Context function library is null"); } @@ -205,30 +206,7 @@ Status TRTEngineOp::ConstructFunctionHandle(OpKernelContext* ctx) { } FunctionLibraryRuntime::InstantiateOptions inst_ops; inst_ops.state_handle = ""; - inst_ops.target = ctx->device()->name(); - native_func_ = 0; - return lib->Instantiate(funcdef_name_, AttrSlice(&fdef->attr()), inst_ops, - &native_func_); -} - -Status TRTEngineOp::ConstructFunctionHandle(OpKernelConstruction* ctx) { - VLOG(1) << "Constructing function handle"; - auto lib = ctx->function_library(); - if (lib == nullptr) { - return errors::Internal("Context function library is null"); - } - auto func_names = lib->GetFunctionLibraryDefinition()->ListFunctionNames(); - for (auto func_name : func_names) { - VLOG(2) << "Func name: " << func_name; - } - auto fdef = lib->GetFunctionLibraryDefinition()->Find(funcdef_name_); - if (fdef == nullptr) { - return errors::Internal("Native FunctionDef ", funcdef_name_, - " can't be found in function library"); - } - FunctionLibraryRuntime::InstantiateOptions inst_ops; - inst_ops.state_handle = ""; - inst_ops.target = ctx->device()->name(); + inst_ops.target = device_name; native_func_ = 0; return lib->Instantiate(funcdef_name_, AttrSlice(&fdef->attr()), inst_ops, &native_func_); @@ -258,7 +236,8 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) context->GetAttr("use_calibration", &use_calibration_)); native_func_ = kInvalidHandle; if (!static_engine_) { - OP_REQUIRES_OK(context, ConstructFunctionHandle(context)); + OP_REQUIRES_OK(context, ConstructFunctionHandle(context->function_library(), + context->device()->name())); FunctionLibraryRuntime* lib = context->function_library(); OP_REQUIRES_OK(context, FunctionDefToGraphDef(native_func_, lib, &segment_graph_, @@ -283,7 +262,9 @@ void TRTEngineOp::ExecuteNativeSegment(OpKernelContext* ctx, std::vector inputs; std::vector* outputs = new std::vector(); if (native_func_ == kInvalidHandle) { - OP_REQUIRES_OK_ASYNC(ctx, ConstructFunctionHandle(ctx), *helper); + OP_REQUIRES_OK_ASYNC(ctx, ConstructFunctionHandle(ctx->function_library(), + ctx->device()->name()), + *helper); } auto lib = ctx->function_library(); FunctionLibraryRuntime::Options opts; From 4325cb35f179c78c7e2db1ee01f87e89ef0fc45f Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 17 Jul 2019 14:38:27 +0000 Subject: [PATCH 0121/3053] Fix api compatibility test Signed-off-by: Yong Tang --- tensorflow/tools/api/golden/v1/tensorflow.pbtxt | 2 +- tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt | 2 +- tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt index 178daad4a2a..303de4a2d6d 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt @@ -2210,7 +2210,7 @@ tf_module { } member_method { name: "sparse_tensor_to_dense" - argspec: "args=[\'sp_input\', \'default_value\', \'validate_indices\', \'name\'], 
varargs=None, keywords=None, defaults=[\'0\', \'True\', \'None\'], " + argspec: "args=[\'sp_input\', \'default_value\', \'validate_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\'], " } member_method { name: "sparse_to_dense" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt index 1fc79d509a9..27c64f2cbf7 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt @@ -126,7 +126,7 @@ tf_module { } member_method { name: "to_dense" - argspec: "args=[\'sp_input\', \'default_value\', \'validate_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'True\', \'None\'], " + argspec: "args=[\'sp_input\', \'default_value\', \'validate_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\'], " } member_method { name: "to_indicator" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt index 96e05c6ea4a..da3149947b3 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt @@ -102,7 +102,7 @@ tf_module { } member_method { name: "to_dense" - argspec: "args=[\'sp_input\', \'default_value\', \'validate_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'True\', \'None\'], " + argspec: "args=[\'sp_input\', \'default_value\', \'validate_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\'], " } member_method { name: "to_indicator" From 99968f53bce4faee500ffaa3f1e67f2bac7152c1 Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Wed, 17 Jul 2019 09:37:02 -0700 Subject: [PATCH 0122/3053] Removed commented out code. 
--- tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc | 1 - tensorflow/compiler/tf2tensorrt/convert/convert_graph.h | 2 -- 2 files changed, 3 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index 74d4da6df73..4c9c3d103c7 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -66,7 +66,6 @@ using absl::StrCat; namespace { -//auto prefixes = IONamePrefixes(); Status BuildNodeMap(const Graph& graph, std::unordered_map* node_map) { diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h index b4f3849a93a..b40bc2ecf9b 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h @@ -32,8 +32,6 @@ namespace tensorflow { namespace tensorrt { namespace convert { -// extern const IONamePrefixes prefixes; - struct ConversionParams { const GraphDef* input_graph_def = nullptr; const std::vector* output_names = nullptr; From 161895847bb57c7a62ee54f63ad5c7dcb0c8ec8d Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Wed, 17 Jul 2019 15:32:26 -0700 Subject: [PATCH 0123/3053] Clean up the lock&tmp files when needed --- .../core/kernels/data/cache_dataset_ops.cc | 34 ++++++++++++++----- .../python/data/kernel_tests/cache_test.py | 14 ++++++++ 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/tensorflow/core/kernels/data/cache_dataset_ops.cc b/tensorflow/core/kernels/data/cache_dataset_ops.cc index 9b1fed90463..750ebc52462 100644 --- a/tensorflow/core/kernels/data/cache_dataset_ops.cc +++ b/tensorflow/core/kernels/data/cache_dataset_ops.cc @@ -215,6 +215,19 @@ class CacheDatasetOp::FileDataset : public DatasetBase { lockfile_created_(false), iteration_completed_(false) {} + ~FileWriterIterator() { + if (!dataset()->env_->FileExists(MetaFilename(filename_)).ok()) { + std::vector cache_files; + dataset() + ->env_ + ->GetMatchingPaths(strings::StrCat(filename_, "*"), &cache_files) + .IgnoreError(); + for (const string& path : cache_files) { + dataset()->env_->DeleteFile(path).IgnoreError(); + } + } + } + Status Initialize(IteratorContext* ctx) override { return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_); } @@ -275,6 +288,9 @@ class CacheDatasetOp::FileDataset : public DatasetBase { Status SaveInternal(IteratorStateWriter* writer) override { mutex_lock l(mu_); + TF_RETURN_IF_ERROR( + writer->WriteScalar(full_name(kCurIndex), cur_index_)); + if (iteration_completed_) { TF_RETURN_IF_ERROR( writer->WriteScalar(full_name(kIterationCompleted), "")); @@ -301,8 +317,6 @@ class CacheDatasetOp::FileDataset : public DatasetBase { lockfile_created_ = false; } TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_)); - TF_RETURN_IF_ERROR( - writer->WriteScalar(full_name(kCurIndex), cur_index_)); TF_RETURN_IF_ERROR(writer->WriteScalar(full_name(kShardId), shard_id_)); return Status::OK(); } @@ -310,12 +324,6 @@ class CacheDatasetOp::FileDataset : public DatasetBase { Status RestoreInternal(IteratorContext* ctx, IteratorStateReader* reader) override { mutex_lock l(mu_); - if (reader->Contains(full_name(kIterationCompleted))) { - iteration_completed_ = true; - return Status::OK(); - } - - TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_)); int64 temp; // TODO(b/78048575): Update this when saving size_t tensors directly // is supported. 
@@ -326,6 +334,14 @@ class CacheDatasetOp::FileDataset : public DatasetBase { return errors::Internal("Invalid value for cur_index ", temp); } } + + if (reader->Contains(full_name(kIterationCompleted))) { + iteration_completed_ = true; + return Status::OK(); + } + + TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_)); + // TODO(b/78048575): Update this when saving size_t tensors directly // is supported. { @@ -409,7 +425,7 @@ class CacheDatasetOp::FileDataset : public DatasetBase { // Merge all the bundles. // Currently there are `shard_id_ + 1` bundles, one for each // checkpoint. Each bundle has prefix _ where `id` is an - // integer starting at 0 an incremented by 1 for each new checkpoint. + // integer starting at 0 and incremented by 1 for each new checkpoint. // We merge all these bundles into a bundle with prefix so // that the next call to `MakeIterator` can build a // `FileReaderIterator`. diff --git a/tensorflow/python/data/kernel_tests/cache_test.py b/tensorflow/python/data/kernel_tests/cache_test.py index 305092c4ba0..b1e884ec7ba 100644 --- a/tensorflow/python/data/kernel_tests/cache_test.py +++ b/tensorflow/python/data/kernel_tests/cache_test.py @@ -25,6 +25,7 @@ import numpy as np from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors @@ -170,6 +171,19 @@ class FileCacheTest(test_base.DatasetTestBase): expected_output = [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]] * 2 self.assertDatasetProduces(dataset, expected_output) + def testCleaningUpCacheFiles(self): + def do_test(i): + dataset = dataset_ops.Dataset.range(10).cache(self.cache_prefix) + get_next = self.getNext(dataset) + for _ in range(i): + try: + self.evaluate(get_next()) + except errors.OutOfRangeError: + break + + if context.executing_eagerly(): + for i in [0, 3, 10, 12, 15]: + do_test(i) @test_util.run_all_in_graph_and_eager_modes class MemoryCacheTest(test_base.DatasetTestBase): From 80223ea8a01718df61891fc8a23645fc02829edc Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Wed, 17 Jul 2019 15:34:00 -0700 Subject: [PATCH 0124/3053] Enhance the tests for CacheDataOp C++ kernel --- .../kernels/data/cache_dataset_ops_test.cc | 26 ++++++++++++++++--- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/kernels/data/cache_dataset_ops_test.cc b/tensorflow/core/kernels/data/cache_dataset_ops_test.cc index 812d719946f..91f202a1506 100644 --- a/tensorflow/core/kernels/data/cache_dataset_ops_test.cc +++ b/tensorflow/core/kernels/data/cache_dataset_ops_test.cc @@ -23,6 +23,19 @@ constexpr char kFileDatasetPrefix[] = "File"; constexpr char kMemoryDatasetPrefix[] = "Memory"; class CacheDatasetOpTest : public DatasetOpsTestBase { + public: + ~CacheDatasetOpTest() { + if (!filename_.empty()) { + std::vector cache_files; + device_->env() + ->GetMatchingPaths(strings::StrCat(filename_, "*"), &cache_files) + .IgnoreError(); + for (const string& path : cache_files) { + device_->env()->DeleteFile(path).IgnoreError(); + } + } + } + protected: // Creates `TensorSliceDataset` variant tensor from the input vector of // tensors. 
@@ -57,8 +70,13 @@ class CacheDatasetOpTest : public DatasetOpsTestBase { std::unique_ptr* context) { TF_RETURN_IF_ERROR(CheckOpKernelInput(*op_kernel, *inputs)); TF_RETURN_IF_ERROR(CreateOpKernelContext(op_kernel, inputs, context)); + TF_RETURN_IF_ERROR(ParseScalarArgument( + context->get(), CacheDatasetOp::kFileName, &filename_)); return Status::OK(); } + + private: + string filename_ = ""; }; struct TestCase { @@ -84,7 +102,7 @@ TestCase TestCase1() { /*expected_output_dtypes*/ {DT_INT64}, /*expected_output_shapes*/ {PartialTensorShape({3, 1})}, /*expected_cardinality*/ 3, - /*breakpoints*/ {0, 4, 11}}; + /*breakpoints*/ {0, 2, 4, 11}}; } // Test case 2: cache empty data in file. @@ -96,7 +114,7 @@ TestCase TestCase2() { /*expected_output_dtypes*/ {DT_INT64}, /*expected_output_shapes*/ {PartialTensorShape({})}, /*expected_cardinality*/ 0, - /*breakpoints*/ {0, 4, 11}}; + /*breakpoints*/ {0, 2, 4, 11}}; } // Test case 3: cache data in memory. @@ -112,7 +130,7 @@ TestCase TestCase3() { /*expected_output_dtypes*/ {DT_INT64}, /*expected_output_shapes*/ {PartialTensorShape({3, 1})}, /*expected_cardinality*/ 3, - /*breakpoints*/ {0, 4, 11}}; + /*breakpoints*/ {0, 2, 4, 11}}; } // Test case 4: cache empty data in memory. @@ -124,7 +142,7 @@ TestCase TestCase4() { /*expected_output_dtypes*/ {DT_INT64}, /*expected_output_shapes*/ {PartialTensorShape({})}, /*expected_cardinality*/ 0, - /*breakpoints*/ {0, 4, 11}}; + /*breakpoints*/ {0, 2, 4, 11}}; } class ParameterizedCacheDatasetOpTest From 83f68f266a1c0c85a4104355b5014f58cff6d7a2 Mon Sep 17 00:00:00 2001 From: Vishnuvardhan Janapati <46058173+jvishnuvardhan@users.noreply.github.com> Date: Wed, 17 Jul 2019 16:20:59 -0700 Subject: [PATCH 0125/3053] Corrected a typo in CategoricalCrossentropy Here is a [gist](https://colab.sandbox.google.com/gist/jvishnuvardhan/13a4de468dbb3853369b8c68caf521d1/pr_categorcialcrossentropy.ipynb) that shows the corrected output after correcting the typo. Thanks! --- tensorflow/python/keras/losses.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/keras/losses.py b/tensorflow/python/keras/losses.py index 2b2fd4f3c00..2f57d1696c9 100644 --- a/tensorflow/python/keras/losses.py +++ b/tensorflow/python/keras/losses.py @@ -419,8 +419,8 @@ class CategoricalCrossentropy(LossFunctionWrapper): cce = tf.keras.losses.CategoricalCrossentropy() loss = cce( [[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]], - [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]]) - print('Loss: ', loss.numpy()) # Loss: 0.3239 + [[.9, .05, .05], [.05, .89, .06], [.05, .01, .94]]) + print('Loss: ', loss.numpy()) # Loss: 0.0945 ``` Usage with the `compile` API: From cc70f17486c0b5416bc2c5d5d6e9014d2f48004f Mon Sep 17 00:00:00 2001 From: Trent Lo Date: Wed, 17 Jul 2019 16:38:56 -0700 Subject: [PATCH 0126/3053] Add reallocation capability to bfc_allocator. This commit mitigates external fragmentation in bfc_allocator by reallocation. That is, even though the sum of the free regions and the unallocated bytes is larger than the requested bytes, bfc_allocator can still fail to allocate a large enough contiguous region to fulfill the request due to fragmentation. To avoid this case, a reallocation feature is implemented to deallocate fully free regions so that a larger region can be formed.
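
As a rough illustration of the idea, the following is a standalone sketch (not code from this patch; the Region struct and ReallocationCanHelp function are hypothetical names) of the estimate that decides whether returning fully free regions to the sub-allocator could let a larger allocation succeed. It assumes the allocated bytes never exceed the memory limit.

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Hypothetical stand-in for an allocation region: its total size plus the
    // bytes currently held by in-use chunks (0 means every chunk is free).
    struct Region {
      std::size_t size_bytes;
      std::size_t in_use_bytes;
    };

    // Returns true if freeing all fully free regions would leave enough room
    // for the rounded request: unallocated budget plus reclaimable bytes must
    // cover it. Assumes allocated <= memory_limit.
    bool ReallocationCanHelp(const std::vector<Region>& regions,
                             std::size_t memory_limit, std::size_t requested) {
      std::size_t allocated = 0, reclaimable = 0;
      for (const Region& r : regions) {
        allocated += r.size_bytes;
        if (r.in_use_bytes == 0) reclaimable += r.size_bytes;  // fully free
      }
      if (reclaimable == 0) return false;  // nothing to give back
      return requested <= memory_limit - allocated + reclaimable;
    }

    int main() {
      // Two 4 MiB regions are completely free, yet neither alone can hold a
      // 6 MiB request: external fragmentation. Freeing both makes it feasible.
      std::vector<Region> regions = {
          {4 << 20, 0}, {4 << 20, 0}, {4 << 20, 1 << 20}};
      std::cout << std::boolalpha
                << ReallocationCanHelp(regions, /*memory_limit=*/16 << 20,
                                       /*requested=*/6 << 20)
                << "\n";  // prints: true
    }

Only when this estimate passes is it worth paying the cost of deallocating the regions and extending again, which is also why the change below logs a note about the potential performance overhead of re-allocation.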
--- .../core/common_runtime/bfc_allocator.cc | 83 +++++++++++++++++++ .../core/common_runtime/bfc_allocator.h | 13 +++ .../gpu/gpu_bfc_allocator_test.cc | 45 ++++++++++ 3 files changed, 141 insertions(+) diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc index 62461cf7fae..80d653dbd8e 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.cc +++ b/tensorflow/core/common_runtime/bfc_allocator.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/bfc_allocator.h" #include +#include "absl/container/flat_hash_set.h" #include "tensorflow/core/common_runtime/allocator_retry.h" #include "tensorflow/core/lib/core/bits.h" @@ -260,6 +261,76 @@ size_t BFCAllocator::RoundedBytes(size_t bytes) { return rounded_bytes; } +bool BFCAllocator::DeallocateFreeRegions(size_t rounded_bytes) { + // Searching for free regions. + absl::flat_hash_set free_region_ptrs; + size_t total_free_bytes = 0; + for (const auto& region : region_manager_.regions()) { + ChunkHandle h = region_manager_.get_handle(region.ptr()); + bool any_use = false; + while (h != kInvalidChunkHandle) { + const Chunk* c = ChunkFromHandle(h); + if (c->in_use()) { + any_use = true; + break; + } + h = c->next; + } + + if (!any_use) { + VLOG(2) << "Found free region with ptr = " << region.ptr(); + free_region_ptrs.insert(region.ptr()); + total_free_bytes += region.memory_size(); + } + } + + if (total_free_bytes == 0) { + return false; + } + + // Rough estimation to check whether deallocation can help. + size_t available_bytes = + memory_limit_ - total_region_allocated_bytes_ + total_free_bytes; + if (rounded_bytes > available_bytes) { + return false; + } + + VLOG(INFO) << "Re-allocate memory regions to avoid OOM due to memory" + << " fragmentation. If you see this message frequently, note" + << " that the re-allocation may incur performance overhead despite" + << " better memory utilization. You may try smaller batch sizes" + << " to see if it can give you better performance."; + + // Deallocate free regions. + auto it = region_manager_.regions().begin(); + while (it != region_manager_.regions().end()) { + if (!free_region_ptrs.contains(it->ptr())) { + ++it; + continue; + } + + VLOG(2) << "Deallocate region with ptr = " << it->ptr(); + // Remove all chunk registrations from Bins. + ChunkHandle h = region_manager_.get_handle(it->ptr()); + while (h != kInvalidChunkHandle) { + const Chunk* c = ChunkFromHandle(h); + if (c->bin_num != kInvalidBinNum) { + RemoveFreeChunkFromBin(h); + } + auto h_to_delete = h; + h = c->next; + DeleteChunk(h_to_delete); + } + + // Deallocate the memory. + sub_allocator_->Free(it->ptr(), it->memory_size()); + total_region_allocated_bytes_ -= it->memory_size(); + it = region_manager_.RemoveAllocationRegion(it); + } + + return true; +} + void* BFCAllocator::AllocateRawInternal(size_t unused_alignment, size_t num_bytes, bool dump_log_on_failure, @@ -307,6 +378,18 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment, } } + // Reaching this point means that no chunks can satisfy the request. Also, + // the unallocated bytes cannot satisfy the request. Before giving up, let's + // try deallocating free regions so that suballocator can combine them with + // the unallocated bytes and form a larger region. 
+ if (DeallocateFreeRegions(rounded_bytes) && + Extend(unused_alignment, rounded_bytes)) { + ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before); + if (ptr != nullptr) { + return ptr; + } + } + // We searched all bins for an existing free chunk to use and // couldn't find one. This means we must have run out of memory, // Dump the memory log for analysis. diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h index bfd857a5e1b..040fe5ed88d 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.h +++ b/tensorflow/core/common_runtime/bfc_allocator.h @@ -309,6 +309,11 @@ class BFCAllocator : public Allocator { regions_.insert(entry, AllocationRegion(ptr, memory_size)); } + std::vector::const_iterator RemoveAllocationRegion( + std::vector::const_iterator it) { + return regions_.erase(it); + } + ChunkHandle get_handle(const void* p) const { return RegionFor(p)->get_handle(p); } @@ -354,6 +359,14 @@ class BFCAllocator : public Allocator { bool Extend(size_t alignment, size_t rounded_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_); + // Deallocate free regions to give back the memory to suballocator, so that + // we can re-allocate a larger region. The main use scenario of this function + // is when OOM happens but we have free regions and the sum of sizes of free + // regions and unallocated bytes is larger than the requested size, implying + // (external) memory fragmentation. Returns true if deallocating any free + // regions; false otherwise. + bool DeallocateFreeRegions(size_t rounded_bytes); + // Returns a pointer to an underlying allocated chunk of size // 'rounded_bytes'. void* FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes, diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc index 75d21d80dcb..f0518f34e79 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc @@ -568,6 +568,47 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test { EXPECT_EQ(GPUBFCAllocator::RoundedBytes(1LL << 31), force_no_allow_growth_allocator.curr_region_allocation_bytes_); } + + void TestRegionDeallocation() { + setenv("TF_FORCE_GPU_ALLOW_GROWTH", "unparseable", 1); + GPUOptions options; + options.set_allow_growth(true); + + // Max of 2GiB, but starts out small. + PlatformGpuId platform_gpu_id(0); + GPUMemAllocator* sub_allocator = new GPUMemAllocator( + GpuIdUtil::ExecutorForPlatformGpuId(platform_gpu_id).ValueOrDie(), + platform_gpu_id, /*use_unified_memory=*/false, {}, {}); + GPUBFCAllocator a(sub_allocator, 1LL << 31, options, "GPU_0_bfc"); + + // Allocate 128 raw pointers of 4 megs. + const size_t size = 1LL << 22; + std::vector initial_ptrs; + for (size_t s = 0; s < 128; s++) { + void* raw = a.AllocateRaw(1, size); + initial_ptrs.push_back(raw); + } + + // Make sure there are more than 1 regions in preparation for the test. + EXPECT_LT(1, a.region_manager_.regions().size()); + + // Deallocate all the memories except the last one. + for (size_t i = 0; i < initial_ptrs.size() - 1; i++) { + a.DeallocateRaw(initial_ptrs[i]); + } + + // Deallocate free regions and there shall be only one region left. + EXPECT_EQ(true, a.DeallocateFreeRegions(/*rounded_bytes=*/0)); + EXPECT_EQ(1, a.region_manager_.regions().size()); + + // There should be only one chunk left in bins. 
+ size_t num_chunks_in_bins = 0; + for (int i = 0; i < BFCAllocator::kNumBins; i++) { + BFCAllocator::Bin* bin = a.BinFromIndex(i); + num_chunks_in_bins += bin->free_chunks.size(); + } + EXPECT_EQ(1, num_chunks_in_bins); + } }; TEST_F(GPUBFCAllocatorPrivateMethodsTest, BinDebugInfo) { TestBinDebugInfo(); } @@ -580,6 +621,10 @@ TEST_F(GPUBFCAllocatorPrivateMethodsTest, ForceAllowGrowth) { TestForceAllowGrowth(); } +TEST_F(GPUBFCAllocatorPrivateMethodsTest, TestRegionDeallocation) { + TestRegionDeallocation(); +} + } // namespace tensorflow #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM From f11e2451c61100ede00d92a9c33994af6e0c2e69 Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Wed, 17 Jul 2019 16:57:48 -0700 Subject: [PATCH 0127/3053] Reverted unnecessary formatting adjustments. --- .../tf2tensorrt/convert/convert_graph.cc | 11 +- .../tf2tensorrt/convert/convert_nodes.cc | 119 ++++++++++-------- .../tf2tensorrt/kernels/trt_engine_op.cc | 6 +- 3 files changed, 73 insertions(+), 63 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index 4c9c3d103c7..f83513c07b2 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -49,9 +49,9 @@ limitations under the License. #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" -#include "tensorflow/core/protobuf/config.pb.h" // NOLINT +#include "tensorflow/core/protobuf/config.pb.h" // NOLINT #include "tensorflow/core/protobuf/device_properties.pb.h" // NOLINT -#include "tensorflow/core/protobuf/rewriter_config.pb.h" // NOLINT +#include "tensorflow/core/protobuf/rewriter_config.pb.h" // NOLINT #include "tensorflow/core/util/device_name_utils.h" #if GOOGLE_CUDA @@ -441,8 +441,6 @@ Status CreateTRTNode(const ConversionParams& params, TrtUniquePtrType engine_data(engine->serialize()); segment_string = string(static_cast(engine_data->data()), engine_data->size()); - } else { - segment_string = ""; } string prec_string; @@ -719,8 +717,9 @@ Status ConvertAfterShapes(const ConversionParams& params) { TrtNodeValidator validator(*params.graph_properties, params.precision_mode, params.use_calibration); TF_RETURN_IF_ERROR(segment::SegmentGraph( - &graph, std::bind(&TrtNodeValidator::IsTensorRTCandidate, &validator, - std::placeholders::_1), + &graph, + std::bind(&TrtNodeValidator::IsTensorRTCandidate, &validator, + std::placeholders::_1), // Input validation is already done by TrtNodeValidator, so we don't // need to check the input edges. 
[](const Edge* edge) { return true; }, OutputEdgeValidator(), diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index 784b29470f6..7c10a1f5288 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -361,9 +361,9 @@ string DebugString(const nvinfer1::Permutation& permutation, int len) { string DebugString(const nvinfer1::ITensor& tensor) { return StrCat("nvinfer1::ITensor(@", reinterpret_cast(&tensor), - ", name=", tensor.getName(), ", dtype=", - DebugString(tensor.getType()), ", dims=", - DebugString(tensor.getDimensions()), ")"); + ", name=", tensor.getName(), + ", dtype=", DebugString(tensor.getType()), + ", dims=", DebugString(tensor.getDimensions()), ")"); } Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l, @@ -441,10 +441,11 @@ Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l, for (int i = 0; i < broadcast_num_dims; ++i) { if ((output_l[i] != output_r[i]) && (output_l[i] != 1) && (output_r[i] != 1)) { - return errors::InvalidArgument( - "Infeasible broadcast scheme (", "batch_dim: ", output_l[0], ", ", - DebugString(*operand_l_new_dims), " vs ", "batch_dim: ", - output_r[0], ", ", DebugString(*operand_r_new_dims), ")"); + return errors::InvalidArgument("Infeasible broadcast scheme (", + "batch_dim: ", output_l[0], ", ", + DebugString(*operand_l_new_dims), " vs ", + "batch_dim: ", output_r[0], ", ", + DebugString(*operand_r_new_dims), ")"); } } } @@ -712,8 +713,8 @@ size_t TRT_ShapedWeights::size_bytes() const { string TRT_ShapedWeights::DebugString() const { return StrCat("TRT_ShapedWeights(shape=", convert::DebugString(shape_), - ", type=", convert::DebugString(type_), ", values=", - reinterpret_cast(GetValues()), ")"); + ", type=", convert::DebugString(type_), + ", values=", reinterpret_cast(GetValues()), ")"); } // A fake ITensor implementation used to check whether the TF-TRT converter can @@ -982,8 +983,10 @@ OpConverterParams::OpConverterParams( use_calibration(converter->use_calibration()) {} const std::set* TrtNodeValidator::quantize_ops = new std::set{ - "QuantizeAndDequantizeV2", "QuantizeAndDequantizeV3", - "FakeQuantWithMinMaxVars", "FakeQuantWithMinMaxArgs", + "QuantizeAndDequantizeV2", + "QuantizeAndDequantizeV3", + "FakeQuantWithMinMaxVars", + "FakeQuantWithMinMaxArgs", }; TrtNodeValidator::TrtNodeValidator( @@ -1062,9 +1065,9 @@ Status TrtNodeValidator::IsTensorRTCandidate(const Node* node) { Status status = ConvertToTensorOrWeights(src_def, edge->src_output(), &tensor_or_weights); if (!status.ok()) { - return errors::Internal("Failed to convert input ", src_def.name(), - " to a TRT_TensorOrWeights: ", - status.error_message()); + return errors::Internal( + "Failed to convert input ", src_def.name(), + " to a TRT_TensorOrWeights: ", status.error_message()); } inputs.push_back(tensor_or_weights); } @@ -1363,9 +1366,9 @@ Status Converter::PrepareTensorForShape(const TRT_TensorOrWeights& input, // CreateConstantLayer. So we can treat it as a tensor for // AreDimsStaticWithDifferentSize(). This really only matters for 0-D tensors. if (AreDimsStaticWithDifferentSize(input_dims, dims, /*is_tensor=*/true)) { - return errors::InvalidArgument("Incompatible shapes: ", - DebugString(input_dims), " vs. ", - DebugString(dims)); + return errors::InvalidArgument( + "Incompatible shapes: ", DebugString(input_dims), " vs. 
", + DebugString(dims)); } // ConstantLayer requires static shapes (cannot infer -1). if (input.is_weights() && !HasStaticShape(dims)) { @@ -1455,7 +1458,7 @@ void Converter::MaybeApplyQuantizationRanges() { // Infer ranges across marked ops. PropagateQuantizationRanges(); -// Apply ranges. + // Apply ranges. #if IS_TRT_VERSION_GE(5, 0, 0, 0) for (auto pair : quantization_ranges_) { nvinfer1::ITensor* tensor = pair.first; @@ -1507,20 +1510,27 @@ void Converter::MaybeApplyQuantizationRanges() { // Conv+Activation(Clip or Relu) are fused. std::set fused_tensors; typedef std::function matcher; - const std::vector>> fused_patterns = { - {"Fused Conv+Bias+Activation", - { + const std::vector>> fused_patterns = { + {"Fused Conv+Bias+Activation", + { + IsConvolution, + IsScale, + IsClipOrRelu, IsConvolution, IsScale, IsClipOrRelu, - }}, - {"Fused Conv+Bias", - { + }}, + {"Fused Conv+Bias", + { + IsConvolution, + IsScale, IsConvolution, IsScale, - }}, - {"Fused Conv+Activation", - { + }}, + {"Fused Conv+Activation", + { + IsConvolution, + IsClipOrRelu, IsConvolution, IsClipOrRelu, - }}, - }; + }}, + }; for (int i = 0; i < this->network()->getNbLayers(); i++) { for (const auto& pattern : fused_patterns) { size_t last_matcher = pattern.second.size() - 1; @@ -2098,11 +2108,11 @@ Status ConvertReshape(OpConverterParams* params) { << "\nreshape_batch_dim=" << reshape_batch_dim << ", reshape_dims=" << DebugString(reshape_dims); if (reshape_may_change_batch_dim) { - const string msg = - StrCat("Reshape on batch dimension is not supported, at ", - node_def.name(), ". input_batch_dim=", input_batch_dim, ", ", - DebugString(input_dims), "; reshape_batch_dim=", - reshape_batch_dim, ", ", DebugString(reshape_dims)); + const string msg = StrCat( + "Reshape on batch dimension is not supported, at ", node_def.name(), + ". input_batch_dim=", input_batch_dim, ", ", DebugString(input_dims), + "; reshape_batch_dim=", reshape_batch_dim, ", ", + DebugString(reshape_dims)); return errors::Unimplemented(msg); } @@ -2810,7 +2820,7 @@ Status ConvertActivation(OpConverterParams* params) { params->converter->network()->addActivation(*inputs.at(0).tensor(), op_pair->second); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); -// Set parameters. + // Set parameters. #if IS_TRT_VERSION_GE(5, 1, 2, 0) if (node_def.op() == "Elu") { layer->setAlpha(1.0f); @@ -4101,8 +4111,8 @@ Status ConvertGather(OpConverterParams* params) { if (trt_gather_output_dims.nbDims != expected_trt_output_rank) { return errors::Internal( "Get unexpected output dimensions of IGatherLayer. Expect nbDims: ", - expected_trt_output_rank, ", actual nbDims: ", - trt_gather_output_dims.nbDims); + expected_trt_output_rank, + ", actual nbDims: ", trt_gather_output_dims.nbDims); } // Reshape the output so after adding the implicit batch dim it'll match the // output shape of TF GatherV2. @@ -4201,9 +4211,8 @@ Status ConvertMatMulHelper(OpConverterParams* params, input_b.GetTrtDims().nbDims == 2; // If int8 is specified, FC must be used unless it is not compatible, as MM // does not support int8 at this time. 
- if (should_use_fc || - (can_use_fc && - params->converter->precision_mode() == TrtPrecisionMode::INT8)) { + if (should_use_fc || (can_use_fc && params->converter->precision_mode() == + TrtPrecisionMode::INT8)) { return ConvertFullyConnectedHelper( params, input_a.tensor(), input_b.weights(), transpose_b, node_name); } @@ -4219,8 +4228,9 @@ Status ConvertMatMulHelper(OpConverterParams* params, // If the MatMul operand is a constant, applies transposes at conversion-time // as necessary. If the operand is a tensor, does nothing. If required // transposes were applied, sets transpose to false. - const auto prepare_matmul_operand = [¶ms]( - TRT_TensorOrWeights operand, bool* transpose) -> nvinfer1::ITensor* { + const auto prepare_matmul_operand = + [¶ms](TRT_TensorOrWeights operand, + bool* transpose) -> nvinfer1::ITensor* { if (operand.is_tensor()) { return operand.tensor(); } else { @@ -4302,18 +4312,19 @@ Status ConvertBatchMatMul(OpConverterParams* params) { // Tensor with TF Dims: 12 5 3 -> TRT Dims: 5 3 // Weight with TF Dims: 12 3 6 -> TRT Dims: 12 3 6 // It is not possible to treat the weight input as a batched [3, 6] tensor. - const auto check_weight_is_not_batched = []( - const TRT_TensorOrWeights& input_l, const TRT_TensorOrWeights& input_r) { - // If input_l is a weight, then input_r must be a tensor because - // otherwise the op would be handled by Grappler. - if (input_l.is_weights() && - input_l.GetTrtDims().nbDims > input_r.GetTrtDims().nbDims && - input_l.GetTrtDims().d[0] != 1) { - return errors::Unimplemented( - "TensorRT does not support batched constants."); - } - return Status::OK(); - }; + const auto check_weight_is_not_batched = + [](const TRT_TensorOrWeights& input_l, + const TRT_TensorOrWeights& input_r) { + // If input_l is a weight, then input_r must be a tensor because + // otherwise the op would be handled by Grappler. + if (input_l.is_weights() && + input_l.GetTrtDims().nbDims > input_r.GetTrtDims().nbDims && + input_l.GetTrtDims().d[0] != 1) { + return errors::Unimplemented( + "TensorRT does not support batched constants."); + } + return Status::OK(); + }; TF_RETURN_IF_ERROR(check_weight_is_not_batched(inputs.at(0), inputs.at(1))); TF_RETURN_IF_ERROR(check_weight_is_not_batched(inputs.at(1), inputs.at(0))); diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index c28436a7fea..53cc44b5a33 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -405,9 +405,9 @@ Status TRTEngineOp::GetEngineInputShapes( // This should not happen, but just for safety. if (actual_input_shapes.size() != cached_input_shapes.size()) { return errors::InvalidArgument( - "Input shape list size mismatch for ", name(), ", cached size: ", - cached_input_shapes.size(), " vs. actual size: ", - actual_input_shapes.size()); + "Input shape list size mismatch for ", name(), + ", cached size: ", cached_input_shapes.size(), + " vs. actual size: ", actual_input_shapes.size()); } if (match_shapes(actual_input_shapes, cached_input_shapes)) { const int cached_batch_size = cached_input_shapes[0].dim_size(0); From 0bf5d44c5d545b85cd53a4efcb659afa8c531ba8 Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Wed, 17 Jul 2019 17:10:30 -0700 Subject: [PATCH 0128/3053] Removed rest of unnecessary formatting. 
--- .../tf2tensorrt/convert/convert_nodes.cc | 25 ++++++++----------- .../tf2tensorrt/kernels/trt_engine_op.cc | 13 +++++----- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index 7c10a1f5288..3920dad6b48 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -1510,27 +1510,24 @@ void Converter::MaybeApplyQuantizationRanges() { // Conv+Activation(Clip or Relu) are fused. std::set fused_tensors; typedef std::function matcher; - const std::vector>> fused_patterns = { - {"Fused Conv+Bias+Activation", - { + const std::vector>> fused_patterns = { + {"Fused Conv+Bias+Activation", + { IsConvolution, IsScale, IsClipOrRelu, - IsConvolution, IsScale, IsClipOrRelu, - }}, - {"Fused Conv+Bias", - { + }}, + {"Fused Conv+Bias", + { IsConvolution, IsScale, - IsConvolution, IsScale, - }}, - {"Fused Conv+Activation", - { + }}, + {"Fused Conv+Activation", + { IsConvolution, IsClipOrRelu, - IsConvolution, IsClipOrRelu, - }}, - }; + }}, + }; for (int i = 0; i < this->network()->getNbLayers(); i++) { for (const auto& pattern : fused_patterns) { size_t last_matcher = pattern.second.size() - 1; diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index 53cc44b5a33..6fccdaa4fe9 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -301,12 +301,13 @@ void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx, core::ScopedUnref unref_cache_res(cache_res); TRTCalibrationResource* calib_res = nullptr; OP_REQUIRES_OK_ASYNC( - ctx, ctx->resource_manager()->LookupOrCreate( - std::string(kCalibrationContainerName), name(), - reinterpret_cast(&calib_res), - {[ctx, this](TRTCalibrationResource** cr) -> Status { - return this->AllocateCalibrationResources(ctx, cr); - }}), + ctx, + ctx->resource_manager()->LookupOrCreate( + std::string(kCalibrationContainerName), name(), + reinterpret_cast(&calib_res), + {[ctx, cache_res, this](TRTCalibrationResource** cr) -> Status { + return this->AllocateCalibrationResources(ctx, cache_res, cr); + }}), *helper); core::ScopedUnref calib_sc(calib_res); int num_inputs = ctx->num_inputs(); From e8e351585beb928183a1fff7c0f053a7438527c0 Mon Sep 17 00:00:00 2001 From: Paul Andrey Date: Thu, 18 Jul 2019 16:58:06 +0200 Subject: [PATCH 0129/3053] Modularized gaussian noises' dtype. Enforced the adjustment of gaussian noises' dtype with that of the inputs. 
This fixes issue #30834 --- tensorflow/python/keras/layers/noise.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/keras/layers/noise.py b/tensorflow/python/keras/layers/noise.py index f230d23c15a..4ef357664fd 100644 --- a/tensorflow/python/keras/layers/noise.py +++ b/tensorflow/python/keras/layers/noise.py @@ -65,7 +65,9 @@ class GaussianNoise(Layer): def noised(): return inputs + K.random_normal( - shape=array_ops.shape(inputs), mean=0., stddev=self.stddev) + shape=array_ops.shape(inputs), mean=0., stddev=self.stddev, + dtype=inputs.dtype + ) return K.in_train_phase(noised, inputs, training=training) @@ -115,7 +117,9 @@ class GaussianDropout(Layer): def noised(): stddev = np.sqrt(self.rate / (1.0 - self.rate)) return inputs * K.random_normal( - shape=array_ops.shape(inputs), mean=1.0, stddev=stddev) + shape=array_ops.shape(inputs), mean=1.0, stddev=stddev, + dtype=inputs.dtype + ) return K.in_train_phase(noised, inputs, training=training) return inputs From 9a51992173794cb739b1216f590e894747fcc283 Mon Sep 17 00:00:00 2001 From: Gianluca Varisco Date: Thu, 18 Jul 2019 19:20:00 +0200 Subject: [PATCH 0130/3053] Update README.md - specify Arduino IDE version This commit specifies that the HOWTO described in the README applies to the Arduino *Desktop* IDE. Specific features, eg. Serial Plotter, are for the time being only available on the Desktop version. --- .../experimental/micro/examples/hello_world/README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tensorflow/lite/experimental/micro/examples/hello_world/README.md b/tensorflow/lite/experimental/micro/examples/hello_world/README.md index 1de9730848c..ac131e70136 100644 --- a/tensorflow/lite/experimental/micro/examples/hello_world/README.md +++ b/tensorflow/lite/experimental/micro/examples/hello_world/README.md @@ -76,11 +76,11 @@ blink instead of fading. ### Obtain and import the library To use this sample application with Arduino, we've created an Arduino library -that includes it as an example that you can open in the Arduino IDE. +that includes it as an example that you can open in the Arduino Desktop IDE. Download the current nightly build of the library: [hello_world.zip](https://storage.googleapis.com/tensorflow-nightly/github/tensorflow/tensorflow/lite/experimental/micro/tools/make/gen/arduino_x86_64/prj/hello_world/hello_world.zip) -Next, import this zip file into the Arduino IDE by going to `Sketch -> Include Library -> Add .ZIP Library...`. +Next, import this zip file into the Arduino Desktop IDE by going to `Sketch -> Include Library -> Add .ZIP Library...`. #### Building the library @@ -98,7 +98,7 @@ A zip file will be created at the following location: tensorflow/lite/experimental/micro/tools/make/gen/arduino_x86_64/prj/hello_world/hello_world.zip ``` -You can then import this zip file into the Arduino IDE by going to `Sketch -> Include Library -> Add .ZIP Library...`. +You can then import this zip file into the Arduino Desktop IDE by going to `Sketch -> Include Library -> Add .ZIP Library...`. ### Load and run the example @@ -106,10 +106,10 @@ Once the library has been added, go to `File -> Examples`. You should see an example near the bottom of the list named `TensorFlowLite:hello_world`. Select it and click `hello_world` to load the example. -Use the Arduino IDE to build and upload the example. Once it is running, you +Use the Arduino Desktop IDE to build and upload the example. 
Once it is running, you should see the built-in LED on your device flashing. -The Arduino IDE includes a plotter that we can use to display the sine wave +The Arduino Desktop IDE includes a plotter that we can use to display the sine wave graphically. To view it, go to `Tools -> Serial Plotter`. You will see one datapoint being logged for each inference cycle, expressed as a number between 0 and 255. From a5d8c796b60a57d907494db8295b4102d68b4941 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Thu, 18 Jul 2019 11:07:47 -0700 Subject: [PATCH 0131/3053] Add the warning log when deleting lock/tmp files fail --- .../core/kernels/data/cache_dataset_ops.cc | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/kernels/data/cache_dataset_ops.cc b/tensorflow/core/kernels/data/cache_dataset_ops.cc index 750ebc52462..7e70385e9b0 100644 --- a/tensorflow/core/kernels/data/cache_dataset_ops.cc +++ b/tensorflow/core/kernels/data/cache_dataset_ops.cc @@ -218,12 +218,18 @@ class CacheDatasetOp::FileDataset : public DatasetBase { ~FileWriterIterator() { if (!dataset()->env_->FileExists(MetaFilename(filename_)).ok()) { std::vector cache_files; - dataset() - ->env_ - ->GetMatchingPaths(strings::StrCat(filename_, "*"), &cache_files) - .IgnoreError(); + Status s = dataset()->env_->GetMatchingPaths( + strings::StrCat(filename_, "*"), &cache_files); + if (!s.ok()) { + LOG(WARNING) << "Failed to get matching files on " << filename_ + << "* : " << s.ToString(); + } for (const string& path : cache_files) { - dataset()->env_->DeleteFile(path).IgnoreError(); + s = dataset()->env_->DeleteFile(path); + if (!s.ok()) { + LOG(WARNING) << "Failed to delete " << path << " : " + << s.ToString(); + } } } } From 817976b48cf24c1167fba51c5801c0d9a82ce98f Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Wed, 10 Jul 2019 23:53:19 +0000 Subject: [PATCH 0132/3053] Adding no_rocm tag to unit-tests that fail on the ROCm platform --- tensorflow/python/compiler/xla/BUILD | 1 + tensorflow/python/keras/distribute/BUILD | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/tensorflow/python/compiler/xla/BUILD b/tensorflow/python/compiler/xla/BUILD index b4b540d51af..1e65273aa23 100644 --- a/tensorflow/python/compiler/xla/BUILD +++ b/tensorflow/python/compiler/xla/BUILD @@ -86,6 +86,7 @@ cuda_py_test( ], tags = [ "no_mac", + "no_rocm", # XLA support is not enabled on the ROCm platform "no_windows", ], xla_enable_strict_auto_jit = True, diff --git a/tensorflow/python/keras/distribute/BUILD b/tensorflow/python/keras/distribute/BUILD index 045c273c2e2..2607fa774b5 100644 --- a/tensorflow/python/keras/distribute/BUILD +++ b/tensorflow/python/keras/distribute/BUILD @@ -105,6 +105,7 @@ distribute_py_test( shard_count = 5, tags = [ "multi_and_single_gpu", + "no_rocm", # times out on ROCm "no_windows_gpu", "notsan", ], @@ -165,6 +166,7 @@ distribute_py_test( shard_count = 19, tags = [ "multi_and_single_gpu", + "no_rocm", # times out on ROCm "no_windows_gpu", # TODO(b/134764123): Re-enable this test. 
"notap", @@ -184,6 +186,7 @@ distribute_py_test( shard_count = 4, tags = [ "multi_and_single_gpu", + "no_rocm", # times out on ROCm "no_windows_gpu", "notsan", ], @@ -201,6 +204,7 @@ distribute_py_test( shard_count = 8, tags = [ "multi_and_single_gpu", + "no_rocm", # times out on ROCm "no_windows_gpu", "notsan", ], From 2b50159ffe0e75230a4ac570d8d0627f640283a8 Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Wed, 10 Jul 2019 23:45:00 +0000 Subject: [PATCH 0133/3053] fixing a couple of unit-test failures that were being caused because the (python) code was passing strings instead of bytes --- tensorflow/lite/python/convert.py | 13 ++++++++++++- tensorflow/lite/python/interpreter.py | 4 ++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/python/convert.py b/tensorflow/lite/python/convert.py index ae1f8bb47f2..bf659c44e43 100644 --- a/tensorflow/lite/python/convert.py +++ b/tensorflow/lite/python/convert.py @@ -153,7 +153,18 @@ def toco_convert_protos(model_flags_str, fp_toco.write(toco_flags_str) fp_input.write(input_data_str) debug_info_str = debug_info_str if debug_info_str else "" - fp_debug.write(debug_info_str) + # if debug_info_str contains a "string value", then the call to + # fp_debug.write(debug_info_str) will fail with the following error + # + # TypeError: a bytes-like object is required, not 'str' + # + # Some of the subtests within the "convert_test" unit-test fail + # with the error shown above. So watch out for that scenario and + # convert debug_info_str to bytes where needed + if isinstance(debug_info_str, str): + fp_debug.write(debug_info_str.encode('utf-8')) + else: + fp_debug.write(debug_info_str) # Reserve an output file with _tempfile.NamedTemporaryFile(delete=False) as fp: diff --git a/tensorflow/lite/python/interpreter.py b/tensorflow/lite/python/interpreter.py index f83a438f959..43b90883c8a 100644 --- a/tensorflow/lite/python/interpreter.py +++ b/tensorflow/lite/python/interpreter.py @@ -99,8 +99,8 @@ class Delegate(object): options_keys = (ctypes.c_char_p * len(options))() options_values = (ctypes.c_char_p * len(options))() for idx, (key, value) in enumerate(options.items()): - options_keys[idx] = str(key) - options_values[idx] = str(value) + options_keys[idx] = str(key).encode('utf-8') + options_values[idx] = str(value).encode('utf-8') class ErrorMessageCapture(object): From cc3533668fc67722a38462d738355ed89fcbcc76 Mon Sep 17 00:00:00 2001 From: Kaixi Hou Date: Thu, 18 Jul 2019 15:29:42 -0700 Subject: [PATCH 0134/3053] Cudnn RNN V2 op is default under TF keras API --- tensorflow/python/keras/layers/cudnn_recurrent.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/tensorflow/python/keras/layers/cudnn_recurrent.py b/tensorflow/python/keras/layers/cudnn_recurrent.py index cec614f087a..c82eecb8d05 100644 --- a/tensorflow/python/keras/layers/cudnn_recurrent.py +++ b/tensorflow/python/keras/layers/cudnn_recurrent.py @@ -19,7 +19,6 @@ from __future__ import division from __future__ import print_function import collections -import os from tensorflow.python.framework import constant_op from tensorflow.python.keras import backend as K @@ -294,7 +293,6 @@ class CuDNNGRU(_CuDNNRNN): ], shape=self._vector_shape) - use_cudnn_v2 = os.environ.get("TF_CUDNN_RNN_USE_V2", "0") args = { "input": inputs, "input_h": input_h, @@ -304,10 +302,7 @@ class CuDNNGRU(_CuDNNRNN): "rnn_mode": 'gru', } - if use_cudnn_v2 != "1": - outputs, h, _, _ = gen_cudnn_rnn_ops.cudnn_rnn(**args) - else: - outputs, h, _, _, _ = 
gen_cudnn_rnn_ops.cudnn_rnnv2(**args) + outputs, h, _, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv2(**args) if self.stateful or self.return_state: h = h[0] @@ -500,7 +495,6 @@ class CuDNNLSTM(_CuDNNRNN): ], shape=self._vector_shape) - use_cudnn_v2 = os.environ.get("TF_CUDNN_RNN_USE_V2", "0") args = { "input": inputs, "input_h": input_h, @@ -509,10 +503,7 @@ class CuDNNLSTM(_CuDNNRNN): "is_training": True, } - if use_cudnn_v2 != "1": - outputs, h, c, _ = gen_cudnn_rnn_ops.cudnn_rnn(**args) - else: - outputs, h, c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv2(**args) + outputs, h, c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv2(**args) if self.stateful or self.return_state: h = h[0] From 521fe01a50009fec4091ab4c674e1921cf188d87 Mon Sep 17 00:00:00 2001 From: Trent Lo Date: Thu, 18 Jul 2019 15:56:30 -0700 Subject: [PATCH 0135/3053] Some code refactory and polishing. --- .../core/common_runtime/bfc_allocator.cc | 26 ++++++++++++------- .../core/common_runtime/bfc_allocator.h | 8 ++++-- .../gpu/gpu_bfc_allocator_test.cc | 1 - 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc index 80d653dbd8e..da4828f114a 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.cc +++ b/tensorflow/core/common_runtime/bfc_allocator.cc @@ -16,7 +16,6 @@ limitations under the License. #include "tensorflow/core/common_runtime/bfc_allocator.h" #include -#include "absl/container/flat_hash_set.h" #include "tensorflow/core/common_runtime/allocator_retry.h" #include "tensorflow/core/lib/core/bits.h" @@ -265,7 +264,7 @@ bool BFCAllocator::DeallocateFreeRegions(size_t rounded_bytes) { // Searching for free regions. absl::flat_hash_set free_region_ptrs; size_t total_free_bytes = 0; - for (const auto& region : region_manager_.regions()) { + for (const AllocationRegion& region : region_manager_.regions()) { ChunkHandle h = region_manager_.get_handle(region.ptr()); bool any_use = false; while (h != kInvalidChunkHandle) { @@ -295,16 +294,25 @@ bool BFCAllocator::DeallocateFreeRegions(size_t rounded_bytes) { return false; } - VLOG(INFO) << "Re-allocate memory regions to avoid OOM due to memory" - << " fragmentation. If you see this message frequently, note" - << " that the re-allocation may incur performance overhead despite" - << " better memory utilization. You may try smaller batch sizes" - << " to see if it can give you better performance."; + VLOG(WARNING) << "Re-allocate memory regions (i.e., allocations) to avoid OOM" + << " due to memory fragmentation. If you see this message" + << " frequently, you are running near the threshold of the" + << " available device memory and it can incur great performance" + << " overhead. You may try smaller batch sizes to observe the" + << " performance impact. Alternatively you may try setting" + << " `allow_growth=false` in GPUOptions."; // Deallocate free regions. 
+ DeallocateRegions(free_region_ptrs); + + return true; +} + +void BFCAllocator::DeallocateRegions( + const absl::flat_hash_set& region_ptrs) { auto it = region_manager_.regions().begin(); while (it != region_manager_.regions().end()) { - if (!free_region_ptrs.contains(it->ptr())) { + if (!region_ptrs.contains(it->ptr())) { ++it; continue; } @@ -327,8 +335,6 @@ bool BFCAllocator::DeallocateFreeRegions(size_t rounded_bytes) { total_region_allocated_bytes_ -= it->memory_size(); it = region_manager_.RemoveAllocationRegion(it); } - - return true; } void* BFCAllocator::AllocateRawInternal(size_t unused_alignment, diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h index 040fe5ed88d..606527476ce 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.h +++ b/tensorflow/core/common_runtime/bfc_allocator.h @@ -23,6 +23,7 @@ limitations under the License. #include #include +#include "absl/container/flat_hash_set.h" #include "tensorflow/core/common_runtime/allocator_retry.h" #include "tensorflow/core/common_runtime/shared_counter.h" #include "tensorflow/core/framework/allocator.h" @@ -363,10 +364,13 @@ class BFCAllocator : public Allocator { // we can re-allocate a larger region. The main use scenario of this function // is when OOM happens but we have free regions and the sum of sizes of free // regions and unallocated bytes is larger than the requested size, implying - // (external) memory fragmentation. Returns true if deallocating any free - // regions; false otherwise. + // (external) memory fragmentation. Returns true if any free regions are + // found and freed; false otherwise. bool DeallocateFreeRegions(size_t rounded_bytes); + // Helper function to deallocate regions. + void DeallocateRegions(const absl::flat_hash_set& region_ptrs); + // Returns a pointer to an underlying allocated chunk of size // 'rounded_bytes'. void* FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes, diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc index f0518f34e79..a808ae7ff72 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator_test.cc @@ -570,7 +570,6 @@ class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test { } void TestRegionDeallocation() { - setenv("TF_FORCE_GPU_ALLOW_GROWTH", "unparseable", 1); GPUOptions options; options.set_allow_growth(true); From 2a8945e59a2ef459240291dc55a5cc63ad8b9daf Mon Sep 17 00:00:00 2001 From: Trent Lo Date: Thu, 18 Jul 2019 16:06:37 -0700 Subject: [PATCH 0136/3053] Minor tweak for the warning message. --- tensorflow/core/common_runtime/bfc_allocator.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc index da4828f114a..7189170365c 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.cc +++ b/tensorflow/core/common_runtime/bfc_allocator.cc @@ -297,10 +297,10 @@ bool BFCAllocator::DeallocateFreeRegions(size_t rounded_bytes) { VLOG(WARNING) << "Re-allocate memory regions (i.e., allocations) to avoid OOM" << " due to memory fragmentation. If you see this message" << " frequently, you are running near the threshold of the" - << " available device memory and it can incur great performance" - << " overhead. You may try smaller batch sizes to observe the" - << " performance impact. 
Alternatively you may try setting" - << " `allow_growth=false` in GPUOptions."; + << " available device memory and re-allocation can incur great" + << " performance overhead. You may try smaller batch sizes to" + << " observe the performance impact. Alternatively you may try" + << " setting `allow_growth=false` in GPUOptions."; // Deallocate free regions. DeallocateRegions(free_region_ptrs); From 0d4a50fb9a63d059ade8d3edac0a382eac7d6a33 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Thu, 18 Jul 2019 16:27:15 -0700 Subject: [PATCH 0137/3053] Skip the test in the graph mode --- .../core/kernels/data/cache_dataset_ops_test.cc | 14 ++++++++++---- tensorflow/python/data/kernel_tests/cache_test.py | 9 ++++++--- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/tensorflow/core/kernels/data/cache_dataset_ops_test.cc b/tensorflow/core/kernels/data/cache_dataset_ops_test.cc index 91f202a1506..6fba6af6876 100644 --- a/tensorflow/core/kernels/data/cache_dataset_ops_test.cc +++ b/tensorflow/core/kernels/data/cache_dataset_ops_test.cc @@ -27,11 +27,17 @@ class CacheDatasetOpTest : public DatasetOpsTestBase { ~CacheDatasetOpTest() { if (!filename_.empty()) { std::vector cache_files; - device_->env() - ->GetMatchingPaths(strings::StrCat(filename_, "*"), &cache_files) - .IgnoreError(); + Status s = device_->env()->GetMatchingPaths( + strings::StrCat(filename_, "*"), &cache_files); + if (!s.ok()) { + LOG(WARNING) << "Failed to get matching files on " << filename_ + << "* : " << s.ToString(); + } for (const string& path : cache_files) { - device_->env()->DeleteFile(path).IgnoreError(); + s = device_->env()->DeleteFile(path); + if (!s.ok()) { + LOG(WARNING) << "Failed to delete " << path << " : " << s.ToString(); + } } } } diff --git a/tensorflow/python/data/kernel_tests/cache_test.py b/tensorflow/python/data/kernel_tests/cache_test.py index b1e884ec7ba..bef4ffb3837 100644 --- a/tensorflow/python/data/kernel_tests/cache_test.py +++ b/tensorflow/python/data/kernel_tests/cache_test.py @@ -181,9 +181,12 @@ class FileCacheTest(test_base.DatasetTestBase): except errors.OutOfRangeError: break - if context.executing_eagerly(): - for i in [0, 3, 10, 12, 15]: - do_test(i) + if not context.executing_eagerly(): + self.skipTest( + "Test requires eager mode for iterators to be deconstructed") + + for i in [0, 3, 10, 12, 15]: + do_test(i) @test_util.run_all_in_graph_and_eager_modes class MemoryCacheTest(test_base.DatasetTestBase): From 5017d0e422be6ea40b034bfce20485f28fd166a9 Mon Sep 17 00:00:00 2001 From: Trent Lo Date: Thu, 18 Jul 2019 16:44:18 -0700 Subject: [PATCH 0138/3053] Use LOG(WARNING) instead of VLOG(WARNING). --- tensorflow/core/common_runtime/bfc_allocator.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc index 7189170365c..1de9cc0b7c5 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.cc +++ b/tensorflow/core/common_runtime/bfc_allocator.cc @@ -294,13 +294,13 @@ bool BFCAllocator::DeallocateFreeRegions(size_t rounded_bytes) { return false; } - VLOG(WARNING) << "Re-allocate memory regions (i.e., allocations) to avoid OOM" - << " due to memory fragmentation. If you see this message" - << " frequently, you are running near the threshold of the" - << " available device memory and re-allocation can incur great" - << " performance overhead. You may try smaller batch sizes to" - << " observe the performance impact. 
Alternatively you may try" - << " setting `allow_growth=false` in GPUOptions."; + LOG(WARNING) << "Re-allocate memory regions (i.e., allocations) to avoid OOM" + << " due to memory fragmentation. If you see this message" + << " frequently, you are running near the threshold of the" + << " available device memory and re-allocation can incur great" + << " performance overhead. You may try smaller batch sizes to" + << " observe the performance impact. Alternatively you may try" + << " setting `allow_growth=false` in GPUOptions."; // Deallocate free regions. DeallocateRegions(free_region_ptrs); From 7379f75705c49d33860a0dfe58b6a32b78ca6b2d Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Fri, 19 Jul 2019 00:38:09 +0000 Subject: [PATCH 0139/3053] Fix make_csv_dataset error when combined with compression type This fix tries to address the issue raised in 30849 where make_csv_dataset throw out an error if combined with compression_type. This fix address the issue by using different file io functions in case compression_type is provided. Note this fix only addresses GZIP format. For ZLIB format, as python's zlib package does not comes with a way to read file stream (only from data buffer) as gzip package, it is not supported. This fix fixes 30849. Signed-off-by: Yong Tang --- .../python/data/experimental/ops/readers.py | 34 +++++++++++++------ 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/tensorflow/python/data/experimental/ops/readers.py b/tensorflow/python/data/experimental/ops/readers.py index cf8b8c7a13e..fd87003f839 100644 --- a/tensorflow/python/data/experimental/ops/readers.py +++ b/tensorflow/python/data/experimental/ops/readers.py @@ -19,6 +19,7 @@ from __future__ import print_function import collections import csv +import gzip import functools import numpy as np @@ -37,6 +38,7 @@ from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_spec +from tensorflow.python.framework import tensor_util from tensorflow.python.lib.io import file_io from tensorflow.python.ops import gen_experimental_dataset_ops from tensorflow.python.ops import io_ops @@ -108,10 +110,10 @@ def _infer_type(str_val, na_value, prev_type): return type_list[i] -def _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header): +def _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header, file_io_fn): """Generator that yields rows of CSV file(s) in order.""" for fn in filenames: - with file_io.FileIO(fn, "r") as f: + with file_io_fn(fn, "r") as f: rdr = csv.reader( f, delimiter=field_delim, @@ -129,14 +131,14 @@ def _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header): def _infer_column_defaults(filenames, num_cols, field_delim, use_quote_delim, na_value, header, num_rows_for_inference, - select_columns): + select_columns, file_io_fn): """Infers column types from the first N valid CSV records of files.""" if select_columns is None: select_columns = range(num_cols) inferred_types = [None] * len(select_columns) for i, csv_row in enumerate( - _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header)): + _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header, file_io_fn)): if num_rows_for_inference is not None and i >= num_rows_for_inference: break @@ -153,13 +155,13 @@ def _infer_column_defaults(filenames, num_cols, field_delim, use_quote_delim, ] -def _infer_column_names(filenames, field_delim, 
use_quote_delim): +def _infer_column_names(filenames, field_delim, use_quote_delim, file_io_fn): """Infers column names from first rows of files.""" csv_kwargs = { "delimiter": field_delim, "quoting": csv.QUOTE_MINIMAL if use_quote_delim else csv.QUOTE_NONE } - with file_io.FileIO(filenames[0], "r") as f: + with file_io_fn(filenames[0], "r") as f: try: column_names = next(csv.reader(f, **csv_kwargs)) except StopIteration: @@ -167,7 +169,7 @@ def _infer_column_names(filenames, field_delim, use_quote_delim): "of %s. Empty file?") % filenames[0]) for name in filenames[1:]: - with file_io.FileIO(name, "r") as f: + with file_io_fn(name, "r") as f: try: if next(csv.reader(f, **csv_kwargs)) != column_names: raise ValueError( @@ -426,12 +428,24 @@ def make_csv_dataset_v2( dataset = dataset.shuffle(len(filenames), shuffle_seed) # Clean arguments; figure out column names and defaults - + if column_names is None or column_defaults is None: + # Find out which io function to open the file + file_io_fn = file_io.FileIO + if compression_type is not None: + compression_type_value = tensor_util.constant_value(compression_type) + if compression_type_value is None: + raise ValueError("Received unkown compression_type") + if compression_type_value == "GZIP": + file_io_fn = gzip.GzipFile + elif compression_type_value == "ZLIB": + raise ValueError("compression_type (%s) is not supported for probing columns" % compression_type) + elif compression_type_value != "": + raise ValueError("compression_type (%s) is not supported" % compression_type) if column_names is None: if not header: raise ValueError("Cannot infer column names without a header line.") # If column names are not provided, infer from the header lines - column_names = _infer_column_names(filenames, field_delim, use_quote_delim) + column_names = _infer_column_names(filenames, field_delim, use_quote_delim, file_io_fn) if len(column_names) != len(set(column_names)): raise ValueError("Cannot have duplicate column names.") @@ -448,7 +462,7 @@ def make_csv_dataset_v2( # construction time column_defaults = _infer_column_defaults( filenames, len(column_names), field_delim, use_quote_delim, na_value, - header, num_rows_for_inference, select_columns) + header, num_rows_for_inference, select_columns, file_io_fn) if select_columns is not None and len(column_defaults) != len(select_columns): raise ValueError( From aaaac186b568c747966650afd0495b8c4c3b30a5 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Fri, 19 Jul 2019 00:47:21 +0000 Subject: [PATCH 0140/3053] Fix pylint issue Signed-off-by: Yong Tang --- .../python/data/experimental/ops/readers.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/data/experimental/ops/readers.py b/tensorflow/python/data/experimental/ops/readers.py index fd87003f839..6a496ba357a 100644 --- a/tensorflow/python/data/experimental/ops/readers.py +++ b/tensorflow/python/data/experimental/ops/readers.py @@ -110,7 +110,8 @@ def _infer_type(str_val, na_value, prev_type): return type_list[i] -def _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header, file_io_fn): +def _next_csv_row( + filenames, num_cols, field_delim, use_quote_delim, header, file_io_fn): """Generator that yields rows of CSV file(s) in order.""" for fn in filenames: with file_io_fn(fn, "r") as f: @@ -138,7 +139,9 @@ def _infer_column_defaults(filenames, num_cols, field_delim, use_quote_delim, inferred_types = [None] * len(select_columns) for i, csv_row in enumerate( - _next_csv_row(filenames, num_cols, 
field_delim, use_quote_delim, header, file_io_fn)): + _next_csv_row( + filenames, num_cols, field_delim, use_quote_delim, + header, file_io_fn)): if num_rows_for_inference is not None and i >= num_rows_for_inference: break @@ -438,14 +441,18 @@ def make_csv_dataset_v2( if compression_type_value == "GZIP": file_io_fn = gzip.GzipFile elif compression_type_value == "ZLIB": - raise ValueError("compression_type (%s) is not supported for probing columns" % compression_type) + raise ValueError( + "compression_type (%s) is not supported for probing columns" % + compression_type) elif compression_type_value != "": - raise ValueError("compression_type (%s) is not supported" % compression_type) + raise ValueError( + "compression_type (%s) is not supported" % compression_type) if column_names is None: if not header: raise ValueError("Cannot infer column names without a header line.") # If column names are not provided, infer from the header lines - column_names = _infer_column_names(filenames, field_delim, use_quote_delim, file_io_fn) + column_names = _infer_column_names( + filenames, field_delim, use_quote_delim, file_io_fn) if len(column_names) != len(set(column_names)): raise ValueError("Cannot have duplicate column names.") From 44e92d03c77eda7aef51bf3af8b7edd5bf4e2744 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Fri, 19 Jul 2019 00:56:45 +0000 Subject: [PATCH 0141/3053] Add test case when no column name is specified and with compression for make_csv_dataset Signed-off-by: Yong Tang --- .../kernel_tests/make_csv_dataset_test.py | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py index 267e3e89487..ca9312f7792 100644 --- a/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py @@ -221,6 +221,38 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): compression_type=compression_type, ) + def testMakeCSVDataset_withCompressionTypeAndNoColumnNames(self): + """Tests `compression_type` argument.""" + record_defaults = [ + constant_op.constant([], dtypes.int32), + constant_op.constant([], dtypes.int64), + constant_op.constant([], dtypes.float32), + constant_op.constant([], dtypes.float64), + constant_op.constant([], dtypes.string) + ] + + column_names = ["col%d" % i for i in range(5)] + inputs = [[",".join(x for x in column_names), "0,1,2,3,4", "5,6,7,8,9"], [ + ",".join(x for x in column_names), "10,11,12,13,14", "15,16,17,18,19" + ]] + expected_output = [[0, 1, 2, 3, b"4"], [5, 6, 7, 8, b"9"], + [10, 11, 12, 13, b"14"], [15, 16, 17, 18, b"19"]] + label = "col0" + + for compression_type in ["GZIP"]: + self._test_dataset( + inputs, + expected_output=expected_output, + expected_keys=column_names, + label_name=label, + batch_size=1, + num_epochs=1, + shuffle=False, + header=True, + column_defaults=record_defaults, + compression_type=compression_type, + ) + def testMakeCSVDataset_withBadInputs(self): """Tests that exception is raised when input is malformed. 
""" From 95cfcbddda220f1d3266bd49d04af5b82617c39a Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Fri, 19 Jul 2019 01:02:31 +0000 Subject: [PATCH 0142/3053] Add additional test case of unsupported ZLIB column probing Signed-off-by: Yong Tang --- .../kernel_tests/make_csv_dataset_test.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py index ca9312f7792..7d2da7a18c0 100644 --- a/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py @@ -239,7 +239,21 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): [10, 11, 12, 13, b"14"], [15, 16, 17, 18, b"19"]] label = "col0" - for compression_type in ["GZIP"]: + self._test_dataset( + inputs, + expected_output=expected_output, + expected_keys=column_names, + label_name=label, + batch_size=1, + num_epochs=1, + shuffle=False, + header=True, + column_defaults=record_defaults, + compression_type="GZIP", + ) + + with self.assertRaisesRegexp( + ValueError, "compression_type .ZLIB. is not supported"): self._test_dataset( inputs, expected_output=expected_output, @@ -250,7 +264,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): shuffle=False, header=True, column_defaults=record_defaults, - compression_type=compression_type, + compression_type="ZLIB", ) def testMakeCSVDataset_withBadInputs(self): From e0997f50762e97c3d6e94399cb1eb2070a452acc Mon Sep 17 00:00:00 2001 From: "Wen-Heng (Jack) Chung" Date: Tue, 2 Jul 2019 20:19:56 +0000 Subject: [PATCH 0143/3053] Refactor nvptx_backend_lib to support both NVPTX and AMDGPU Notice nvptx_backend_lib shall better be renamed as gpu_backend_lib but it is skipped in this commit so minimize potential impacts to other XLA clients. - Created xla::gpu::nvptx namespace to store NVPTX-specific logic and values. - Created xla::gpu::amdgpu namespace to store AMDGPU-specific logic and values. - Extract platform-neutral logic to anonymous namespace. - Pass StreamExecutor* from nvptx_compiler to nvptx_backend_lib to help determine platform-specific behaviors constructing LLVM TargetMachine. - Break CompileModuleToPtx into 2 functions: - ConstructLLVMTargetMachineForModule : setup LLVM TargetMachine based on StreamExecutor* passed in from frontend. - nvptx::EmitModuleToPTX : NVPTX-specific logic to drive LLVM NVPTX backend. - Modify LinkLibdeviceIfNecessary to use LinkWithBitcodeVector. - LinkWithBitcodeVector would link a vector of paths to LLVM bitcode libs, this utility routine could support both NVPTX (libdevice) and AMDGPU (ROCm-Device-Libs). --- .../gpu/llvm_gpu_backend/nvptx_backend_lib.cc | 367 ++++++++++++------ .../gpu/llvm_gpu_backend/nvptx_backend_lib.h | 9 +- .../xla/service/gpu/nvptx_compiler.cc | 5 +- 3 files changed, 251 insertions(+), 130 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc index 9f52f09004b..3f6fca079b4 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc @@ -63,39 +63,33 @@ limitations under the License. 
namespace xla { namespace gpu { -namespace { + +// Forward declaration for logic specific to LLVM NVPTX backend +namespace nvptx { // Default inline threshold value to use in llvm. const int kDefaultInlineThreshold = 1100; // Gets the GPU name as it's known to LLVM for a given compute capability. If // we see an unrecognized compute capability, we return "sm_35". -static string GetSmName(std::pair compute_capability) { - static auto* m = new std::map, int>({ - {{3, 5}, 35}, - {{3, 7}, 37}, - {{5, 0}, 50}, - {{5, 2}, 52}, - {{5, 3}, 53}, - {{6, 0}, 60}, - {{6, 1}, 61}, - {{6, 2}, 62}, - {{7, 0}, 70}, - {{7, 2}, 72}, - {{7, 5}, 75}, - }); - int sm_version = 35; - auto it = m->find(compute_capability); - if (it != m->end()) { - sm_version = it->second; - } else { - LOG(WARNING) << "Unknown compute capability (" << compute_capability.first - << ", " << compute_capability.second << ") ." - << "Defaulting to telling LLVM that we're compiling for sm_" - << sm_version; - } - return absl::StrCat("sm_", sm_version); -} +static string GetSmName(std::pair compute_capability); + +Status LinkLibdeviceIfNecessary(llvm::Module* module, + std::pair compute_capability, + const string& libdevice_dir_path); +} // namespace nvptx + +// Forward declaration for logic specific to LLVM AMDGPU backend +namespace amdgpu { + +// Inline threshold value to use in LLVM AMDGPU backend. +const int kAMDGPUInlineThreshold = 1048576; + +Status LinkROCDLIfNecessary(llvm::Module* module, int amdgpu_version, + const string& rocdl_dir_path); +} // namespace amdgpu + +namespace { // Convenience function for producing a name of a temporary compilation product // from the input filename. @@ -124,7 +118,7 @@ void InitializePasses(llvm::PassRegistry* pass_registry) { // Returns the TargetMachine, given a triple. std::unique_ptr GetTargetMachine( llvm::Triple triple, absl::string_view cpu_name, - const HloModuleConfig& hlo_module_config) { + const HloModuleConfig& hlo_module_config, absl::string_view feature_str) { std::string error; const llvm::Target* target = TargetRegistry::lookupTarget("", triple, error); if (target == nullptr) { @@ -155,8 +149,9 @@ std::unique_ptr GetTargetMachine( codegen_opt_level = CodeGenOpt::None; } return absl::WrapUnique(target->createTargetMachine( - triple.str(), llvm_ir::AsStringRef(cpu_name), "+ptx60", target_options, - getRelocModel(), getCodeModel(), codegen_opt_level)); + triple.str(), llvm_ir::AsStringRef(cpu_name), + llvm_ir::AsStringRef(feature_str), target_options, getRelocModel(), + getCodeModel(), codegen_opt_level)); } // Adds the standard LLVM optimization passes, based on the speed optimization @@ -166,13 +161,14 @@ std::unique_ptr GetTargetMachine( void AddOptimizationPasses(unsigned opt_level, unsigned size_level, llvm::TargetMachine* target_machine, llvm::legacy::PassManagerBase* module_passes, - llvm::legacy::FunctionPassManager* function_passes) { + llvm::legacy::FunctionPassManager* function_passes, + int inline_threshold) { PassManagerBuilder builder; builder.OptLevel = opt_level; builder.SizeLevel = size_level; if (opt_level > 1) { - builder.Inliner = llvm::createFunctionInliningPass(kDefaultInlineThreshold); + builder.Inliner = llvm::createFunctionInliningPass(inline_threshold); } else { // Only inline functions marked with "alwaysinline". builder.Inliner = llvm::createAlwaysInlinerLegacyPass(); @@ -202,29 +198,6 @@ void EmitBitcodeToFile(const Module& module, absl::string_view filename) { outfile.keep(); } -// Emits the given module to PTX. 
target_machine is an initialized TargetMachine -// for the NVPTX target. -string EmitModuleToPTX(Module* module, llvm::TargetMachine* target_machine) { - std::string ptx; // need a std::string instead of a ::string. - { - llvm::raw_string_ostream stream(ptx); - llvm::buffer_ostream pstream(stream); - // The extension is stripped by IrDumpingPassManager, so we need to - // get creative to add a suffix. - IrDumpingPassManager codegen_passes( - MakeNameForTempProduct(module->getModuleIdentifier(), "-nvptx.dummy"), - "", false); - codegen_passes.add(new llvm::TargetLibraryInfoWrapperPass( - llvm::Triple(module->getTargetTriple()))); - - target_machine->addPassesToEmitFile(codegen_passes, pstream, nullptr, - llvm::TargetMachine::CGFT_AssemblyFile); - codegen_passes.run(*module); - } - - return ptx; -} - // LLVM has an extensive flags mechanism of its own, which is only accessible // through the command line. Internal libraries within LLVM register parsers for // flags, with no other way to configure them except pass these flags. @@ -243,7 +216,7 @@ void FeedLLVMWithFlags(const std::vector& cl_opts) { // Returns whether the module could use any libdevice functions. This function // may have false positives -- the module might not use libdevice even if this // function returns true. -bool CouldNeedLibdevice(const llvm::Module& module) { +bool CouldNeedDeviceBitcode(const llvm::Module& module) { for (const llvm::Function& function : module.functions()) { // This is a conservative approximation -- not all such functions are in // libdevice. @@ -254,68 +227,70 @@ bool CouldNeedLibdevice(const llvm::Module& module) { return false; } -// Links libdevice into the given module if the module needs libdevice. -Status LinkLibdeviceIfNecessary(llvm::Module* module, - std::pair compute_capability, - const string& libdevice_dir_path) { - if (!CouldNeedLibdevice(*module)) { - return Status::OK(); - } - - // CUDA 9+ uses a single libdevice file for all devices, and we don't support - // older CUDAs. - string libdevice_path = - tensorflow::io::JoinPath(libdevice_dir_path, "libdevice.10.bc"); - if (!tensorflow::Env::Default()->FileExists(libdevice_path).ok()) { - LOG(WARNING) - << "libdevice is required by this HLO module but was not found at " - << libdevice_path; - return xla::InternalError("libdevice not found at %s", libdevice_path); - } - - VLOG(1) << "Linking with libdevice from: " << libdevice_path; - std::unique_ptr libdevice_module = - LoadIRModule(libdevice_path, &module->getContext()); - +// Links the module with a vector of path to bitcode modules +// The paths are guaranteed to exist. 
+Status LinkWithBitcodeVector(llvm::Module* module, + const std::vector& bitcode_path_vector) { llvm::Linker linker(*module); - if (linker.linkInModule( - std::move(libdevice_module), llvm::Linker::Flags::LinkOnlyNeeded, - [](Module& M, const StringSet<>& GVS) { - internalizeModule(M, [&GVS](const GlobalValue& GV) { - return !GV.hasName() || (GVS.count(GV.getName()) == 0); - }); - })) { - return xla::InternalError("Error linking libdevice from %s", - libdevice_path); + + for (auto& bitcode_path : bitcode_path_vector) { + if (!tensorflow::Env::Default()->FileExists(bitcode_path).ok()) { + LOG(WARNING) << "bitcode module is required by this HLO module but was " + "not found at " + << bitcode_path; + return xla::InternalError("bitcode module not found at %s", bitcode_path); + } + + std::unique_ptr bitcode_module = + LoadIRModule(bitcode_path, &module->getContext()); + if (linker.linkInModule( + std::move(bitcode_module), llvm::Linker::Flags::LinkOnlyNeeded, + [](Module& M, const StringSet<>& GVS) { + internalizeModule(M, [&M, &GVS](const GlobalValue& GV) { + return !GV.hasName() || (GVS.count(GV.getName()) == 0); + }); + })) { + return xla::InternalError("Error linking bitcode module from %s", + bitcode_path); + } } return Status::OK(); } -StatusOr CompileModuleToPtx(llvm::Module* module, - std::pair compute_capability, +StatusOr> +ConstructLLVMTargetMachineForModule(llvm::Module* module, + GpuVersion gpu_version, const HloModuleConfig& hlo_module_config, - const string& libdevice_dir_path) { - // If the module has no functions or globals, there's nothing to compile. Just - // return an empty string. - if (module->empty() && module->global_empty()) { - VLOG(2) << "Module '" << module->getName().str() - << "' is empty. Skipping compilation."; - return string(); + const string& device_bitcode_dir_path, + se::StreamExecutor* stream_exec) { + // Check if we are running the backend for NVPTX or AMDGPU + bool isNVPTX = (stream_exec->platform_kind() == se::PlatformKind::kCuda); + + if (isNVPTX) { + // Link the input module with libdevice, to pull in implementations of some + // builtins. + TF_RETURN_IF_ERROR(nvptx::LinkLibdeviceIfNecessary( + module, absl::get>(gpu_version), + device_bitcode_dir_path)); + } else { + // Link the input module with ROCDL + TF_RETURN_IF_ERROR(amdgpu::LinkROCDLIfNecessary( + module, absl::get(gpu_version), device_bitcode_dir_path)); } - // Link the input module with libdevice, to pull in implementations of some - // builtins. - TF_RETURN_IF_ERROR( - LinkLibdeviceIfNecessary(module, compute_capability, libdevice_dir_path)); - // Set the flush-denormals-to-zero flag on the module so the NVVM reflect pass - // can access it. - module->addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", - hlo_module_config.debug_options().xla_gpu_ftz()); + // Add NVPTX-specific flags and attributes to the module + if (isNVPTX) { + // Set the flush-denormals-to-zero flag on the module so the NVVM reflect + // pass can access it. + module->addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", + hlo_module_config.debug_options().xla_gpu_ftz()); - // If ftz is enabled, set it as an attribute on every function in the module. - if (hlo_module_config.debug_options().xla_gpu_ftz()) { - for (llvm::Function& fn : *module) { - fn.addFnAttr("nvptx-f32ftz", "true"); + // If ftz is enabled, set it as an attribute on every function in the + // module. 
+ if (hlo_module_config.debug_options().xla_gpu_ftz()) { + for (llvm::Function& fn : *module) { + fn.addFnAttr("nvptx-f32ftz", "true"); + } } } @@ -332,13 +307,28 @@ StatusOr CompileModuleToPtx(llvm::Module* module, llvm::Triple target_triple = llvm::Triple(module->getTargetTriple()); if (target_triple.getArch() == llvm::Triple::UnknownArch) { LOG(WARNING) << "target triple not found in the module"; - target_triple = llvm::Triple("nvptx64-unknown-unknown"); + if (isNVPTX) { + target_triple = llvm::Triple("nvptx64-unknown-unknown"); + } else { + target_triple = llvm::Triple("amdgcn--amdhsa-amdgiz"); + } + } + + // Construct LLVM TargetMachine + std::unique_ptr target_machine; + if (isNVPTX) { + // Figure out the exact name of the processor as known to the NVPTX backend + // from the gpu_architecture flag. + target_machine = GetTargetMachine( + target_triple, + nvptx::GetSmName(absl::get>(gpu_version)), + hlo_module_config, "+ptx60"); + } else { + target_machine = GetTargetMachine( + target_triple, absl::StrCat("gfx", absl::get(gpu_version)), + hlo_module_config, "-code-object-v3"); } - // Figure out the exact name of the processor as known to the NVPTX backend - // from the gpu_architecture flag. - std::unique_ptr target_machine = GetTargetMachine( - target_triple, GetSmName(compute_capability), hlo_module_config); module_passes.add(llvm::createTargetTransformInfoWrapperPass( target_machine->getTargetIRAnalysis())); @@ -365,9 +355,12 @@ StatusOr CompileModuleToPtx(llvm::Module* module, LOG(ERROR) << std::string(80, '*'); } + // Add optimization passes, and set inliner threshold AddOptimizationPasses(opt_level, /*size_level=*/0, target_machine.get(), &module_passes, - &function_passes); + &function_passes, + (isNVPTX) ? nvptx::kDefaultInlineThreshold + : amdgpu::kAMDGPUInlineThreshold); // Loop unrolling exposes more opportunities for SROA. Therefore, we run SROA // again after the standard optimization passes [http://b/13329423]. @@ -394,13 +387,87 @@ StatusOr CompileModuleToPtx(llvm::Module* module, function_passes.doFinalization(); module_passes.run(*module); - // Finally, produce PTX. - return EmitModuleToPTX(module, target_machine.get()); + return std::move(target_machine); +} +} // namespace + +// Logic specific to LLVM NVPTX backend +namespace nvptx { + +// Gets the GPU name as it's known to LLVM for a given compute capability. If +// we see an unrecognized compute capability, we return "sm_35". +static string GetSmName(std::pair compute_capability) { + static auto* m = new std::map, int>({ + {{3, 5}, 35}, + {{3, 7}, 37}, + {{5, 0}, 50}, + {{5, 2}, 52}, + {{5, 3}, 53}, + {{6, 0}, 60}, + {{6, 1}, 61}, + {{6, 2}, 62}, + {{7, 0}, 70}, + {{7, 2}, 72}, + {{7, 5}, 75}, + }); + int sm_version = 35; + auto it = m->find(compute_capability); + if (it != m->end()) { + sm_version = it->second; + } else { + LOG(WARNING) << "Unknown compute capability (" << compute_capability.first + << ", " << compute_capability.second << ") ." + << "Defaulting to telling LLVM that we're compiling for sm_" + << sm_version; + } + return absl::StrCat("sm_", sm_version); +} + +// Emits the given module to PTX. target_machine is an initialized TargetMachine +// for the NVPTX target. +StatusOr EmitModuleToPTX(Module* module, + llvm::TargetMachine* target_machine) { + std::string ptx; // need a std::string instead of a ::string. + { + llvm::raw_string_ostream stream(ptx); + llvm::buffer_ostream pstream(stream); + // The extension is stripped by IrDumpingPassManager, so we need to + // get creative to add a suffix. 
+ IrDumpingPassManager codegen_passes( + MakeNameForTempProduct(module->getModuleIdentifier(), "-nvptx.dummy"), + "", false); + codegen_passes.add(new llvm::TargetLibraryInfoWrapperPass( + llvm::Triple(module->getTargetTriple()))); + + target_machine->addPassesToEmitFile(codegen_passes, pstream, nullptr, + llvm::TargetMachine::CGFT_AssemblyFile); + codegen_passes.run(*module); + } + + return ptx; +} + +// Links libdevice into the given module if the module needs libdevice. +Status LinkLibdeviceIfNecessary(llvm::Module* module, + std::pair compute_capability, + const string& libdevice_dir_path) { + if (!CouldNeedDeviceBitcode(*module)) { + return Status::OK(); + } + + // CUDA 9+ uses a single libdevice file for all devices, and we don't support + // older CUDAs. + string libdevice_path = + tensorflow::io::JoinPath(libdevice_dir_path, "libdevice.10.bc"); + + VLOG(1) << "Linking with libdevice from: " << libdevice_path; + std::vector libdevice_path_vector{libdevice_path}; + return LinkWithBitcodeVector(module, libdevice_path_vector); } // One-time module initializer. // Must be called only once -- DO NOT CALL DIRECTLY. -void GPUBackendInit(const HloModuleConfig& hlo_module_config) { +void NVPTXBackendInit(const HloModuleConfig& hlo_module_config) { // Feed all customized flags here, so we can override them with llvm_cl_opts // without redeploy the compiler for development purpose. @@ -444,24 +511,74 @@ void GPUBackendInit(const HloModuleConfig& hlo_module_config) { InitializePasses(registry); } -} // namespace +} // namespace nvptx + +// Logic specific to LLVM AMDGPU backend +namespace amdgpu { + +// Gets the ROCm-Device-Libs filenames for a particular AMDGPU version. +static std::vector GetROCDLPaths(int amdgpu_version, + const string& rocdl_dir_path) { + // AMDGPU version-neutral bitcodes + std::vector result{"hc.amdgcn.bc", + "opencl.amdgcn.bc", + "ocml.amdgcn.bc", + "ockl.amdgcn.bc", + "oclc_finite_only_off.amdgcn.bc", + "oclc_daz_opt_off.amdgcn.bc", + "oclc_correctly_rounded_sqrt_on.amdgcn.bc", + "oclc_unsafe_math_off.amdgcn.bc"}; + + // AMDGPU version-specific bitcodes + result.push_back(tensorflow::io::JoinPath( + rocdl_dir_path, tensorflow::strings::StrCat( + "oclc_isa_version_", amdgpu_version, ".amdgcn.bc"))); + return std::move(result); +} + +// Links ROCm-Device-Libs into the given module if the module needs it. 
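// (Note that GetROCDLPaths above joins only the version-specific
// oclc_isa_version_*.amdgcn.bc entry with rocdl_dir_path and returns the
// version-neutral filenames bare; the later "Fix ROCDL path processing logic"
// change reworks this so that every entry carries the full directory prefix.)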
+Status LinkROCDLIfNecessary(llvm::Module* module, int amdgpu_version, + const string& rocdl_dir_path) { + if (!CouldNeedDeviceBitcode(*module)) { + return tensorflow::Status::OK(); + } + + return LinkWithBitcodeVector(module, + GetROCDLPaths(amdgpu_version, rocdl_dir_path)); +} + +} // namespace amdgpu StatusOr CompileToPtx(llvm::Module* module, - std::pair compute_capability, + GpuVersion gpu_version, const HloModuleConfig& hlo_module_config, - const string& libdevice_dir_path) { + const string& libdevice_dir_path, + se::StreamExecutor* stream_exec) { static std::once_flag backend_init_flag; - std::call_once(backend_init_flag, GPUBackendInit, hlo_module_config); + std::call_once(backend_init_flag, nvptx::NVPTXBackendInit, hlo_module_config); string ptx; + std::unique_ptr target_machine; { tensorflow::profiler::TraceMe activity( [&] { return absl::StrCat("Compiling IR:", module->getName().str()); }, tensorflow::profiler::TraceMeLevel::kInfo); XLA_SCOPED_LOGGING_TIMER("Compile module " + module->getName().str()); - TF_ASSIGN_OR_RETURN( - ptx, CompileModuleToPtx(module, compute_capability, hlo_module_config, - libdevice_dir_path)); + + // If the module has no functions or globals, there's nothing to compile. + // Just return an empty string. + if (module->empty() && module->global_empty()) { + VLOG(2) << "Module '" << module->getName().str() + << "' is empty. Skipping compilation."; + return string(); + } + + TF_ASSIGN_OR_RETURN(target_machine, + ConstructLLVMTargetMachineForModule( + module, gpu_version, hlo_module_config, + libdevice_dir_path, stream_exec)); + TF_ASSIGN_OR_RETURN(ptx, + nvptx::EmitModuleToPTX(module, target_machine.get())); } return ptx; } diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h index 9654175bfaf..a4e8c925328 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h @@ -21,6 +21,7 @@ limitations under the License. #include #include "absl/strings/string_view.h" +#include "absl/types/variant.h" #include "llvm/IR/Module.h" #include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/statusor.h" @@ -29,6 +30,8 @@ limitations under the License. namespace xla { namespace gpu { +using GpuVersion = absl::variant, int>; + // Compiles the argument module and returns it. libdevice_dir_path is the parent // directory of the libdevice bitcode libraries. The contents of the module may // be changed. @@ -36,10 +39,10 @@ namespace gpu { // The Compile.* interfaces each create their own llvm::LLVMContext objects for // thread safety, but note that LLVM's multithreaded support is very // preliminary; multithreaded use is not recommended at this time. 
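// GpuVersion, declared above, carries either a CUDA compute capability as a
// std::pair<int, int> of (major, minor) or an AMDGPU ISA version as a plain
// int; backend-specific code retrieves the active alternative with
// absl::get<std::pair<int, int>>(gpu_version) or absl::get<int>(gpu_version)
// respectively.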
-StatusOr CompileToPtx(llvm::Module* module, - std::pair compute_capability, +StatusOr CompileToPtx(llvm::Module* module, GpuVersion gpu_version, const HloModuleConfig& hlo_module_config, - const string& libdevice_dir_path); + const string& libdevice_dir_path, + se::StreamExecutor* stream_exec); } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc index 20b3d64c417..14f464ab702 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -628,8 +628,9 @@ StatusOr> NVPTXCompiler::RunBackend( string ptx; { XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunBackend - CompileToPtx"); - TF_ASSIGN_OR_RETURN(ptx, CompileToPtx(&llvm_module, {cc_major, cc_minor}, - module->config(), libdevice_dir)); + TF_ASSIGN_OR_RETURN( + ptx, CompileToPtx(&llvm_module, std::pair{cc_major, cc_minor}, + module->config(), libdevice_dir, stream_exec)); } llvm_ir::DumpIrIfEnabled(*module, llvm_module, /*optimized=*/true); From 9a53d54e74c23b66d2ba2cf2cdb4bed56022f02a Mon Sep 17 00:00:00 2001 From: "Wen-Heng (Jack) Chung" Date: Wed, 3 Jul 2019 15:17:07 +0000 Subject: [PATCH 0144/3053] Fix ROCDL path processing logic --- .../gpu/llvm_gpu_backend/nvptx_backend_lib.cc | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc index 3f6fca079b4..cd0b3a35b89 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc @@ -520,16 +520,23 @@ namespace amdgpu { static std::vector GetROCDLPaths(int amdgpu_version, const string& rocdl_dir_path) { // AMDGPU version-neutral bitcodes - std::vector result{"hc.amdgcn.bc", - "opencl.amdgcn.bc", - "ocml.amdgcn.bc", - "ockl.amdgcn.bc", - "oclc_finite_only_off.amdgcn.bc", - "oclc_daz_opt_off.amdgcn.bc", - "oclc_correctly_rounded_sqrt_on.amdgcn.bc", - "oclc_unsafe_math_off.amdgcn.bc"}; + std::vector rocdl_filename_vector{ + "hc.amdgcn.bc", + "opencl.amdgcn.bc", + "ocml.amdgcn.bc", + "ockl.amdgcn.bc", + "oclc_finite_only_off.amdgcn.bc", + "oclc_daz_opt_off.amdgcn.bc", + "oclc_correctly_rounded_sqrt_on.amdgcn.bc", + "oclc_unsafe_math_off.amdgcn.bc"}; - // AMDGPU version-specific bitcodes + // Construct full path to ROCDL bitcode libraries + std::vector result; + for (auto& filename : rocdl_filename_vector) { + result.push_back(tensorflow::io::JoinPath(rocdl_dir_path, filename)); + } + + // Add AMDGPU version-specific bitcodes result.push_back(tensorflow::io::JoinPath( rocdl_dir_path, tensorflow::strings::StrCat( "oclc_isa_version_", amdgpu_version, ".amdgcn.bc"))); From 1275263f843a81d8479133387a37c59b87918c78 Mon Sep 17 00:00:00 2001 From: "Wen-Heng (Jack) Chung" Date: Wed, 3 Jul 2019 19:23:18 -0500 Subject: [PATCH 0145/3053] Remove undesirable StreamExecutor from LLVM backend interface --- .../gpu/llvm_gpu_backend/nvptx_backend_lib.cc | 12 +++++------- .../service/gpu/llvm_gpu_backend/nvptx_backend_lib.h | 3 +-- .../compiler/xla/service/gpu/nvptx_compiler.cc | 2 +- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc index cd0b3a35b89..68c992f929f 100644 --- 
a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc @@ -261,10 +261,10 @@ StatusOr> ConstructLLVMTargetMachineForModule(llvm::Module* module, GpuVersion gpu_version, const HloModuleConfig& hlo_module_config, - const string& device_bitcode_dir_path, - se::StreamExecutor* stream_exec) { + const string& device_bitcode_dir_path) { // Check if we are running the backend for NVPTX or AMDGPU - bool isNVPTX = (stream_exec->platform_kind() == se::PlatformKind::kCuda); + llvm::Triple target_triple = llvm::Triple(module->getTargetTriple()); + bool isNVPTX = target_triple.isNVPTX(); if (isNVPTX) { // Link the input module with libdevice, to pull in implementations of some @@ -304,7 +304,6 @@ ConstructLLVMTargetMachineForModule(llvm::Module* module, // Try to fetch the target triple from the module. If not present, set a // default target triple. - llvm::Triple target_triple = llvm::Triple(module->getTargetTriple()); if (target_triple.getArch() == llvm::Triple::UnknownArch) { LOG(WARNING) << "target triple not found in the module"; if (isNVPTX) { @@ -559,8 +558,7 @@ Status LinkROCDLIfNecessary(llvm::Module* module, int amdgpu_version, StatusOr CompileToPtx(llvm::Module* module, GpuVersion gpu_version, const HloModuleConfig& hlo_module_config, - const string& libdevice_dir_path, - se::StreamExecutor* stream_exec) { + const string& libdevice_dir_path) { static std::once_flag backend_init_flag; std::call_once(backend_init_flag, nvptx::NVPTXBackendInit, hlo_module_config); @@ -583,7 +581,7 @@ StatusOr CompileToPtx(llvm::Module* module, TF_ASSIGN_OR_RETURN(target_machine, ConstructLLVMTargetMachineForModule( module, gpu_version, hlo_module_config, - libdevice_dir_path, stream_exec)); + libdevice_dir_path)); TF_ASSIGN_OR_RETURN(ptx, nvptx::EmitModuleToPTX(module, target_machine.get())); } diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h index a4e8c925328..e0990f2c6a9 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h @@ -41,8 +41,7 @@ using GpuVersion = absl::variant, int>; // preliminary; multithreaded use is not recommended at this time. StatusOr CompileToPtx(llvm::Module* module, GpuVersion gpu_version, const HloModuleConfig& hlo_module_config, - const string& libdevice_dir_path, - se::StreamExecutor* stream_exec); + const string& libdevice_dir_path); } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc index 14f464ab702..8161bcecc92 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -630,7 +630,7 @@ StatusOr> NVPTXCompiler::RunBackend( XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunBackend - CompileToPtx"); TF_ASSIGN_OR_RETURN( ptx, CompileToPtx(&llvm_module, std::pair{cc_major, cc_minor}, - module->config(), libdevice_dir, stream_exec)); + module->config(), libdevice_dir)); } llvm_ir::DumpIrIfEnabled(*module, llvm_module, /*optimized=*/true); From 0da64425d6c06c2bf14d74d92ad6f8d4526ca500 Mon Sep 17 00:00:00 2001 From: "Wen-Heng (Jack) Chung" Date: Mon, 8 Jul 2019 15:47:19 +0000 Subject: [PATCH 0146/3053] Address code review comments. 
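This change splits the backend-specific pieces into nvptx and amdgpu namespaces, each with its own LinkAndOptimizeModule. A rough sketch of the resulting NVPTX path, using the names from the diff below (not the verbatim code):

```
// nvptx::CompileToPtx, after the empty-module early-out:
std::unique_ptr<llvm::TargetMachine> target_machine;
TF_ASSIGN_OR_RETURN(
    target_machine,
    LinkAndOptimizeModule(module, compute_capability, hlo_module_config,
                          libdevice_dir_path));
TF_ASSIGN_OR_RETURN(ptx, EmitModuleToPTX(module, target_machine.get()));
return ptx;
```

The AMDGPU side mirrors this with LinkROCDLIfNecessary and its own LinkAndOptimizeModule, differing mainly in the bitcode libraries linked, the default target triple, and the inliner threshold.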
--- .../gpu/llvm_gpu_backend/nvptx_backend_lib.cc | 462 ++++++++++-------- .../gpu/llvm_gpu_backend/nvptx_backend_lib.h | 8 +- .../xla/service/gpu/nvptx_compiler.cc | 6 +- 3 files changed, 267 insertions(+), 209 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc index 68c992f929f..271bc3f3a6d 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc @@ -64,7 +64,13 @@ limitations under the License. namespace xla { namespace gpu { -// Forward declaration for logic specific to LLVM NVPTX backend +namespace amdgpu { + +// Inline threshold value to use in LLVM AMDGPU backend. +const int kAMDGPUInlineThreshold = 0x100000; + +} // namespace amdgpu + namespace nvptx { // Default inline threshold value to use in llvm. @@ -72,23 +78,35 @@ const int kDefaultInlineThreshold = 1100; // Gets the GPU name as it's known to LLVM for a given compute capability. If // we see an unrecognized compute capability, we return "sm_35". -static string GetSmName(std::pair compute_capability); +static string GetSmName(std::pair compute_capability) { + static auto* m = new std::map, int>({ + {{3, 5}, 35}, + {{3, 7}, 37}, + {{5, 0}, 50}, + {{5, 2}, 52}, + {{5, 3}, 53}, + {{6, 0}, 60}, + {{6, 1}, 61}, + {{6, 2}, 62}, + {{7, 0}, 70}, + {{7, 2}, 72}, + {{7, 5}, 75}, + }); + int sm_version = 35; + auto it = m->find(compute_capability); + if (it != m->end()) { + sm_version = it->second; + } else { + LOG(WARNING) << "Unknown compute capability (" << compute_capability.first + << ", " << compute_capability.second << ") ." + << "Defaulting to telling LLVM that we're compiling for sm_" + << sm_version; + } + return absl::StrCat("sm_", sm_version); +} -Status LinkLibdeviceIfNecessary(llvm::Module* module, - std::pair compute_capability, - const string& libdevice_dir_path); } // namespace nvptx -// Forward declaration for logic specific to LLVM AMDGPU backend -namespace amdgpu { - -// Inline threshold value to use in LLVM AMDGPU backend. -const int kAMDGPUInlineThreshold = 1048576; - -Status LinkROCDLIfNecessary(llvm::Module* module, int amdgpu_version, - const string& rocdl_dir_path); -} // namespace amdgpu - namespace { // Convenience function for producing a name of a temporary compilation product @@ -198,6 +216,36 @@ void EmitBitcodeToFile(const Module& module, absl::string_view filename) { outfile.keep(); } +} // namespace + +namespace nvptx { +// Emits the given module to PTX. target_machine is an initialized TargetMachine +// for the NVPTX target. +StatusOr EmitModuleToPTX(Module* module, + llvm::TargetMachine* target_machine) { + std::string ptx; // need a std::string instead of a ::string. + { + llvm::raw_string_ostream stream(ptx); + llvm::buffer_ostream pstream(stream); + // The extension is stripped by IrDumpingPassManager, so we need to + // get creative to add a suffix. 
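// (CGFT_AssemblyFile below asks the target to emit its textual "assembly",
// which for the NVPTX backend is the PTX string that EmitModuleToPTX returns;
// addPassesToEmitFile reports failure by returning true when the target
// cannot emit that file type.)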
+ IrDumpingPassManager codegen_passes( + MakeNameForTempProduct(module->getModuleIdentifier(), "-nvptx.dummy"), + "", false); + codegen_passes.add(new llvm::TargetLibraryInfoWrapperPass( + llvm::Triple(module->getTargetTriple()))); + + target_machine->addPassesToEmitFile(codegen_passes, pstream, nullptr, + llvm::TargetMachine::CGFT_AssemblyFile); + codegen_passes.run(*module); + } + + return ptx; +} + +} // namespace nvptx + +namespace { // LLVM has an extensive flags mechanism of its own, which is only accessible // through the command line. Internal libraries within LLVM register parsers for // flags, with no other way to configure them except pass these flags. @@ -213,13 +261,13 @@ void FeedLLVMWithFlags(const std::vector& cl_opts) { llvm::cl::ParseCommandLineOptions(fake_argv.size(), &fake_argv[0]); } -// Returns whether the module could use any libdevice functions. This function -// may have false positives -- the module might not use libdevice even if this -// function returns true. +// Returns whether the module could use any device bitcode library functions. +// This function may have false positives -- the module might not use libdevice +// on NVPTX or ROCm-Device-Libs on AMDGPU even if this function returns true. bool CouldNeedDeviceBitcode(const llvm::Module& module) { for (const llvm::Function& function : module.functions()) { // This is a conservative approximation -- not all such functions are in - // libdevice. + // libdevice or ROCm-Device-Libs. if (!function.isIntrinsic() && function.isDeclaration()) { return true; } @@ -227,8 +275,8 @@ bool CouldNeedDeviceBitcode(const llvm::Module& module) { return false; } -// Links the module with a vector of path to bitcode modules -// The paths are guaranteed to exist. +// Links the module with a vector of path to bitcode modules. +// The caller must guarantee that the paths exist. Status LinkWithBitcodeVector(llvm::Module* module, const std::vector& bitcode_path_vector) { llvm::Linker linker(*module); @@ -257,40 +305,53 @@ Status LinkWithBitcodeVector(llvm::Module* module, return Status::OK(); } -StatusOr> -ConstructLLVMTargetMachineForModule(llvm::Module* module, - GpuVersion gpu_version, - const HloModuleConfig& hlo_module_config, - const string& device_bitcode_dir_path) { - // Check if we are running the backend for NVPTX or AMDGPU - llvm::Triple target_triple = llvm::Triple(module->getTargetTriple()); - bool isNVPTX = target_triple.isNVPTX(); +} // namespace - if (isNVPTX) { - // Link the input module with libdevice, to pull in implementations of some - // builtins. - TF_RETURN_IF_ERROR(nvptx::LinkLibdeviceIfNecessary( - module, absl::get>(gpu_version), - device_bitcode_dir_path)); - } else { - // Link the input module with ROCDL - TF_RETURN_IF_ERROR(amdgpu::LinkROCDLIfNecessary( - module, absl::get(gpu_version), device_bitcode_dir_path)); +namespace nvptx { + +// Links libdevice into the given module if the module needs libdevice. +Status LinkLibdeviceIfNecessary(llvm::Module* module, + std::pair compute_capability, + const string& libdevice_dir_path) { + if (!CouldNeedDeviceBitcode(*module)) { + return Status::OK(); } - // Add NVPTX-specific flags and attributes to the module - if (isNVPTX) { - // Set the flush-denormals-to-zero flag on the module so the NVVM reflect - // pass can access it. - module->addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", - hlo_module_config.debug_options().xla_gpu_ftz()); + // CUDA 9+ uses a single libdevice file for all devices, and we don't support + // older CUDAs. 
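// (Before CUDA 9 the toolkit shipped per-architecture variants such as
// libdevice.compute_35.10.bc; CUDA 9 merged them into the single
// libdevice.10.bc loaded here, which is why only one filename is needed.)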
+ string libdevice_path = + tensorflow::io::JoinPath(libdevice_dir_path, "libdevice.10.bc"); + if (!tensorflow::Env::Default()->FileExists(libdevice_path).ok()) { + LOG(WARNING) + << "libdevice is required by this HLO module but was not found at " + << libdevice_path; + return xla::InternalError("libdevice not found at %s", libdevice_path); + } - // If ftz is enabled, set it as an attribute on every function in the - // module. - if (hlo_module_config.debug_options().xla_gpu_ftz()) { - for (llvm::Function& fn : *module) { - fn.addFnAttr("nvptx-f32ftz", "true"); - } + VLOG(1) << "Linking with libdevice from: " << libdevice_path; + std::vector libdevice_path_vector{libdevice_path}; + return LinkWithBitcodeVector(module, libdevice_path_vector); +} + +StatusOr> LinkAndOptimizeModule( + llvm::Module* module, std::pair compute_capability, + const HloModuleConfig& hlo_module_config, + const string& device_bitcode_dir_path) { + // Link the input module with libdevice, to pull in implementations of some + // builtins. + TF_RETURN_IF_ERROR(LinkLibdeviceIfNecessary(module, compute_capability, + device_bitcode_dir_path)); + + // Set the flush-denormals-to-zero flag on the module so the NVVM reflect + // pass can access it. + module->addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", + hlo_module_config.debug_options().xla_gpu_ftz()); + + // If ftz is enabled, set it as an attribute on every function in the + // module. + if (hlo_module_config.debug_options().xla_gpu_ftz()) { + for (llvm::Function& fn : *module) { + fn.addFnAttr("nvptx-f32ftz", "true"); } } @@ -304,29 +365,17 @@ ConstructLLVMTargetMachineForModule(llvm::Module* module, // Try to fetch the target triple from the module. If not present, set a // default target triple. + llvm::Triple target_triple = llvm::Triple(module->getTargetTriple()); if (target_triple.getArch() == llvm::Triple::UnknownArch) { LOG(WARNING) << "target triple not found in the module"; - if (isNVPTX) { - target_triple = llvm::Triple("nvptx64-unknown-unknown"); - } else { - target_triple = llvm::Triple("amdgcn--amdhsa-amdgiz"); - } + target_triple = llvm::Triple("nvptx64-unknown-unknown"); } - // Construct LLVM TargetMachine - std::unique_ptr target_machine; - if (isNVPTX) { - // Figure out the exact name of the processor as known to the NVPTX backend - // from the gpu_architecture flag. - target_machine = GetTargetMachine( - target_triple, - nvptx::GetSmName(absl::get>(gpu_version)), - hlo_module_config, "+ptx60"); - } else { - target_machine = GetTargetMachine( - target_triple, absl::StrCat("gfx", absl::get(gpu_version)), - hlo_module_config, "-code-object-v3"); - } + // Figure out the exact name of the processor as known to the NVPTX backend + // from the gpu_architecture flag. + std::unique_ptr target_machine = + GetTargetMachine(target_triple, GetSmName(compute_capability), + hlo_module_config, "+ptx60"); module_passes.add(llvm::createTargetTransformInfoWrapperPass( target_machine->getTargetIRAnalysis())); @@ -354,12 +403,10 @@ ConstructLLVMTargetMachineForModule(llvm::Module* module, LOG(ERROR) << std::string(80, '*'); } - // Add optimization passes, and set inliner threshold + // Add optimization passes, and set inliner threshold. AddOptimizationPasses(opt_level, /*size_level=*/0, target_machine.get(), &module_passes, - &function_passes, - (isNVPTX) ? nvptx::kDefaultInlineThreshold - : amdgpu::kAMDGPUInlineThreshold); + &function_passes, kDefaultInlineThreshold); // Loop unrolling exposes more opportunities for SROA. 
Therefore, we run SROA // again after the standard optimization passes [http://b/13329423]. @@ -388,81 +435,6 @@ ConstructLLVMTargetMachineForModule(llvm::Module* module, return std::move(target_machine); } -} // namespace - -// Logic specific to LLVM NVPTX backend -namespace nvptx { - -// Gets the GPU name as it's known to LLVM for a given compute capability. If -// we see an unrecognized compute capability, we return "sm_35". -static string GetSmName(std::pair compute_capability) { - static auto* m = new std::map, int>({ - {{3, 5}, 35}, - {{3, 7}, 37}, - {{5, 0}, 50}, - {{5, 2}, 52}, - {{5, 3}, 53}, - {{6, 0}, 60}, - {{6, 1}, 61}, - {{6, 2}, 62}, - {{7, 0}, 70}, - {{7, 2}, 72}, - {{7, 5}, 75}, - }); - int sm_version = 35; - auto it = m->find(compute_capability); - if (it != m->end()) { - sm_version = it->second; - } else { - LOG(WARNING) << "Unknown compute capability (" << compute_capability.first - << ", " << compute_capability.second << ") ." - << "Defaulting to telling LLVM that we're compiling for sm_" - << sm_version; - } - return absl::StrCat("sm_", sm_version); -} - -// Emits the given module to PTX. target_machine is an initialized TargetMachine -// for the NVPTX target. -StatusOr EmitModuleToPTX(Module* module, - llvm::TargetMachine* target_machine) { - std::string ptx; // need a std::string instead of a ::string. - { - llvm::raw_string_ostream stream(ptx); - llvm::buffer_ostream pstream(stream); - // The extension is stripped by IrDumpingPassManager, so we need to - // get creative to add a suffix. - IrDumpingPassManager codegen_passes( - MakeNameForTempProduct(module->getModuleIdentifier(), "-nvptx.dummy"), - "", false); - codegen_passes.add(new llvm::TargetLibraryInfoWrapperPass( - llvm::Triple(module->getTargetTriple()))); - - target_machine->addPassesToEmitFile(codegen_passes, pstream, nullptr, - llvm::TargetMachine::CGFT_AssemblyFile); - codegen_passes.run(*module); - } - - return ptx; -} - -// Links libdevice into the given module if the module needs libdevice. -Status LinkLibdeviceIfNecessary(llvm::Module* module, - std::pair compute_capability, - const string& libdevice_dir_path) { - if (!CouldNeedDeviceBitcode(*module)) { - return Status::OK(); - } - - // CUDA 9+ uses a single libdevice file for all devices, and we don't support - // older CUDAs. - string libdevice_path = - tensorflow::io::JoinPath(libdevice_dir_path, "libdevice.10.bc"); - - VLOG(1) << "Linking with libdevice from: " << libdevice_path; - std::vector libdevice_path_vector{libdevice_path}; - return LinkWithBitcodeVector(module, libdevice_path_vector); -} // One-time module initializer. // Must be called only once -- DO NOT CALL DIRECTLY. @@ -510,57 +482,12 @@ void NVPTXBackendInit(const HloModuleConfig& hlo_module_config) { InitializePasses(registry); } -} // namespace nvptx - -// Logic specific to LLVM AMDGPU backend -namespace amdgpu { - -// Gets the ROCm-Device-Libs filenames for a particular AMDGPU version. 
-static std::vector GetROCDLPaths(int amdgpu_version, - const string& rocdl_dir_path) { - // AMDGPU version-neutral bitcodes - std::vector rocdl_filename_vector{ - "hc.amdgcn.bc", - "opencl.amdgcn.bc", - "ocml.amdgcn.bc", - "ockl.amdgcn.bc", - "oclc_finite_only_off.amdgcn.bc", - "oclc_daz_opt_off.amdgcn.bc", - "oclc_correctly_rounded_sqrt_on.amdgcn.bc", - "oclc_unsafe_math_off.amdgcn.bc"}; - - // Construct full path to ROCDL bitcode libraries - std::vector result; - for (auto& filename : rocdl_filename_vector) { - result.push_back(tensorflow::io::JoinPath(rocdl_dir_path, filename)); - } - - // Add AMDGPU version-specific bitcodes - result.push_back(tensorflow::io::JoinPath( - rocdl_dir_path, tensorflow::strings::StrCat( - "oclc_isa_version_", amdgpu_version, ".amdgcn.bc"))); - return std::move(result); -} - -// Links ROCm-Device-Libs into the given module if the module needs it. -Status LinkROCDLIfNecessary(llvm::Module* module, int amdgpu_version, - const string& rocdl_dir_path) { - if (!CouldNeedDeviceBitcode(*module)) { - return tensorflow::Status::OK(); - } - - return LinkWithBitcodeVector(module, - GetROCDLPaths(amdgpu_version, rocdl_dir_path)); -} - -} // namespace amdgpu - StatusOr CompileToPtx(llvm::Module* module, - GpuVersion gpu_version, + std::pair compute_capability, const HloModuleConfig& hlo_module_config, const string& libdevice_dir_path) { static std::once_flag backend_init_flag; - std::call_once(backend_init_flag, nvptx::NVPTXBackendInit, hlo_module_config); + std::call_once(backend_init_flag, NVPTXBackendInit, hlo_module_config); string ptx; std::unique_ptr target_machine; @@ -578,15 +505,146 @@ StatusOr CompileToPtx(llvm::Module* module, return string(); } - TF_ASSIGN_OR_RETURN(target_machine, - ConstructLLVMTargetMachineForModule( - module, gpu_version, hlo_module_config, - libdevice_dir_path)); - TF_ASSIGN_OR_RETURN(ptx, - nvptx::EmitModuleToPTX(module, target_machine.get())); + TF_ASSIGN_OR_RETURN( + target_machine, + LinkAndOptimizeModule(module, compute_capability, hlo_module_config, + libdevice_dir_path)); + TF_ASSIGN_OR_RETURN(ptx, EmitModuleToPTX(module, target_machine.get())); } return ptx; } +} // namespace nvptx + +namespace amdgpu { + +// Gets the ROCm-Device-Libs filenames for a particular AMDGPU version. +static std::vector GetROCDLPaths(int amdgpu_version, + const string& rocdl_dir_path) { + // AMDGPU version-neutral bitcodes. + std::vector rocdl_filename_vector{ + "hc.amdgcn.bc", + "opencl.amdgcn.bc", + "ocml.amdgcn.bc", + "ockl.amdgcn.bc", + "oclc_finite_only_off.amdgcn.bc", + "oclc_daz_opt_off.amdgcn.bc", + "oclc_correctly_rounded_sqrt_on.amdgcn.bc", + "oclc_unsafe_math_off.amdgcn.bc"}; + + // Construct full path to ROCDL bitcode libraries. + std::vector result; + for (auto& filename : rocdl_filename_vector) { + result.push_back(tensorflow::io::JoinPath(rocdl_dir_path, filename)); + } + + // Add AMDGPU version-specific bitcodes. + result.push_back(tensorflow::io::JoinPath( + rocdl_dir_path, tensorflow::strings::StrCat( + "oclc_isa_version_", amdgpu_version, ".amdgcn.bc"))); + return std::move(result); +} + +// Links ROCm-Device-Libs into the given module if the module needs it. 
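// (For a hypothetical rocdl_dir_path of "/opt/rocm/lib" and amdgpu_version of
// 900, GetROCDLPaths above would return "/opt/rocm/lib/hc.amdgcn.bc" through
// "/opt/rocm/lib/oclc_unsafe_math_off.amdgcn.bc" plus the version-specific
// "/opt/rocm/lib/oclc_isa_version_900.amdgcn.bc"; LinkROCDLIfNecessary below
// then links them all in with LinkWithBitcodeVector.)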
+Status LinkROCDLIfNecessary(llvm::Module* module, int amdgpu_version, + const string& rocdl_dir_path) { + if (!CouldNeedDeviceBitcode(*module)) { + return tensorflow::Status::OK(); + } + + return LinkWithBitcodeVector(module, + GetROCDLPaths(amdgpu_version, rocdl_dir_path)); +} + +StatusOr> LinkAndOptimizeModule( + llvm::Module* module, int amdgpu_version, + const HloModuleConfig& hlo_module_config, + const string& device_bitcode_dir_path) { + // Link the input module with ROCDL. + TF_RETURN_IF_ERROR(amdgpu::LinkROCDLIfNecessary(module, amdgpu_version, + device_bitcode_dir_path)); + + IrDumpingPassManager module_passes(module->getModuleIdentifier(), "", false); + + // Add an appropriate TargetLibraryInfo pass for the module's triple. + llvm::TargetLibraryInfoWrapperPass* tliwp = + new llvm::TargetLibraryInfoWrapperPass( + llvm::Triple(module->getTargetTriple())); + module_passes.add(tliwp); + + // Try to fetch the target triple from the module. If not present, set a + // default target triple. + llvm::Triple target_triple = llvm::Triple(module->getTargetTriple()); + if (target_triple.getArch() == llvm::Triple::UnknownArch) { + LOG(WARNING) << "target triple not found in the module"; + target_triple = llvm::Triple("amdgcn--amdhsa-amdgiz"); + } + + // Construct LLVM TargetMachine. + std::unique_ptr target_machine = GetTargetMachine( + target_triple, absl::StrCat("gfx", amdgpu_version), + hlo_module_config, "-code-object-v3"); + + module_passes.add(llvm::createTargetTransformInfoWrapperPass( + target_machine->getTargetIRAnalysis())); + + // The LLVM IR verifier performs sanity checking on the IR. This helps + // discover problems and report them in a meaningful manner, rather than let + // later passes report obscure assertions because of unfulfilled invariants. + module_passes.add(llvm::createVerifierPass()); + + // Create the function-level pass manager. It needs data layout information + // too. + llvm::legacy::FunctionPassManager function_passes(module); + + int32 opt_level = + hlo_module_config.debug_options().xla_backend_optimization_level(); + + if (opt_level < 2) { + LOG(ERROR) << std::string(80, '*'); + LOG(ERROR) << "The XLA GPU backend doesn't support unoptimized code " + "generation but "; + LOG(ERROR) << "--xla_backend_optimization_level is set to " << opt_level + << "!"; + LOG(ERROR) << "(Supported configuration is " + "--xla_backend_optimization_level >= 2.)"; + LOG(ERROR) << std::string(80, '*'); + } + + // Add optimization passes, and set inliner threshold. + AddOptimizationPasses(opt_level, + /*size_level=*/0, target_machine.get(), &module_passes, + &function_passes, kAMDGPUInlineThreshold); + + // Loop unrolling exposes more opportunities for SROA. Therefore, we run SROA + // again after the standard optimization passes [http://b/13329423]. + // TODO(jingyue): SROA may further expose more optimization opportunities such + // as more precise alias analysis and more function inlining (SROA may change + // the inlining cost of a function). For now, running SROA already emits good + // enough code for the evaluated benchmarks. We may want to run more + // optimizations later. + if (opt_level > 0) { + // LLVM's optimizer turns on SROA when the optimization level is greater + // than 0. We mimic this behavior here. + module_passes.add(llvm::createSROAPass()); + } + + // Verify that the module is well formed after optimizations ran. + module_passes.add(llvm::createVerifierPass()); + + // Done populating the pass managers. Now run them. 
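// (With the legacy pass managers used here, function passes have to be
// bracketed by doInitialization()/doFinalization() and run once per function,
// while the module pass manager makes a single run over the whole module.)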
+ + function_passes.doInitialization(); + for (auto func = module->begin(); func != module->end(); ++func) { + function_passes.run(*func); + } + function_passes.doFinalization(); + module_passes.run(*module); + + return std::move(target_machine); +} + +} // namespace amdgpu + } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h index e0990f2c6a9..d1528dd3604 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h @@ -21,7 +21,6 @@ limitations under the License. #include #include "absl/strings/string_view.h" -#include "absl/types/variant.h" #include "llvm/IR/Module.h" #include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/statusor.h" @@ -30,8 +29,7 @@ limitations under the License. namespace xla { namespace gpu { -using GpuVersion = absl::variant, int>; - +namespace nvptx { // Compiles the argument module and returns it. libdevice_dir_path is the parent // directory of the libdevice bitcode libraries. The contents of the module may // be changed. @@ -39,9 +37,11 @@ using GpuVersion = absl::variant, int>; // The Compile.* interfaces each create their own llvm::LLVMContext objects for // thread safety, but note that LLVM's multithreaded support is very // preliminary; multithreaded use is not recommended at this time. -StatusOr CompileToPtx(llvm::Module* module, GpuVersion gpu_version, +StatusOr CompileToPtx(llvm::Module* module, + std::pair compute_capability, const HloModuleConfig& hlo_module_config, const string& libdevice_dir_path); +} // namespace nvptx } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc index 8161bcecc92..86915d9bce6 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -628,9 +628,9 @@ StatusOr> NVPTXCompiler::RunBackend( string ptx; { XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunBackend - CompileToPtx"); - TF_ASSIGN_OR_RETURN( - ptx, CompileToPtx(&llvm_module, std::pair{cc_major, cc_minor}, - module->config(), libdevice_dir)); + TF_ASSIGN_OR_RETURN(ptx, + nvptx::CompileToPtx(&llvm_module, {cc_major, cc_minor}, + module->config(), libdevice_dir)); } llvm_ir::DumpIrIfEnabled(*module, llvm_module, /*optimized=*/true); From 3b5c39e043078f875d14abc5ca6e3947ad14bd10 Mon Sep 17 00:00:00 2001 From: "Wen-Heng (Jack) Chung" Date: Tue, 9 Jul 2019 08:46:42 -0500 Subject: [PATCH 0147/3053] Address code review comments. --- .../xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc index 271bc3f3a6d..963719577be 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc @@ -342,13 +342,12 @@ StatusOr> LinkAndOptimizeModule( TF_RETURN_IF_ERROR(LinkLibdeviceIfNecessary(module, compute_capability, device_bitcode_dir_path)); - // Set the flush-denormals-to-zero flag on the module so the NVVM reflect - // pass can access it. 
+ // Set the flush-denormals-to-zero flag on the module so the NVVM reflect pass + // can access it. module->addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", hlo_module_config.debug_options().xla_gpu_ftz()); - // If ftz is enabled, set it as an attribute on every function in the - // module. + // If ftz is enabled, set it as an attribute on every function in the module. if (hlo_module_config.debug_options().xla_gpu_ftz()) { for (llvm::Function& fn : *module) { fn.addFnAttr("nvptx-f32ftz", "true"); From 955c5a1ed3283010db831b86e4e5aed3302b0848 Mon Sep 17 00:00:00 2001 From: "Wen-Heng (Jack) Chung" Date: Tue, 9 Jul 2019 20:52:25 +0000 Subject: [PATCH 0148/3053] Re-introduce GpuVersion --- tensorflow/compiler/xla/BUILD | 1 + tensorflow/compiler/xla/types.h | 1 + 2 files changed, 2 insertions(+) diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index eeb598b165b..ba728af76cf 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -130,6 +130,7 @@ cc_library( deps = [ "//tensorflow/core:framework_lite", "//third_party/eigen3", + "@com_google_absl//absl/types:variant", ], ) diff --git a/tensorflow/compiler/xla/types.h b/tensorflow/compiler/xla/types.h index 3b4e1aef08b..8b1e9942680 100644 --- a/tensorflow/compiler/xla/types.h +++ b/tensorflow/compiler/xla/types.h @@ -19,6 +19,7 @@ limitations under the License. #include #include +#include "absl/types/variant.h" #include "third_party/eigen3/Eigen/Core" #include "tensorflow/core/framework/numeric_types.h" #include "tensorflow/core/platform/types.h" From 2fb89483dc23ace1aec393525f466d6a4a2ed81d Mon Sep 17 00:00:00 2001 From: "Wen-Heng (Jack) Chung" Date: Tue, 9 Jul 2019 20:53:01 +0000 Subject: [PATCH 0149/3053] Extract common logic from LinkAndOptimizeModule. --- .../gpu/llvm_gpu_backend/nvptx_backend_lib.cc | 159 +++++++----------- 1 file changed, 60 insertions(+), 99 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc index 963719577be..cb9797a002f 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc @@ -333,14 +333,14 @@ Status LinkLibdeviceIfNecessary(llvm::Module* module, return LinkWithBitcodeVector(module, libdevice_path_vector); } -StatusOr> LinkAndOptimizeModule( - llvm::Module* module, std::pair compute_capability, - const HloModuleConfig& hlo_module_config, - const string& device_bitcode_dir_path) { +Status NVPTXTargetModuleLinker(llvm::Module* module, GpuVersion gpu_version, + const HloModuleConfig& hlo_module_config, + const string& device_bitcode_dir_path) { // Link the input module with libdevice, to pull in implementations of some // builtins. - TF_RETURN_IF_ERROR(LinkLibdeviceIfNecessary(module, compute_capability, - device_bitcode_dir_path)); + TF_RETURN_IF_ERROR(LinkLibdeviceIfNecessary( + module, absl::get>(gpu_version), + device_bitcode_dir_path)); // Set the flush-denormals-to-zero flag on the module so the NVVM reflect pass // can access it. @@ -354,6 +354,36 @@ StatusOr> LinkAndOptimizeModule( } } + return Status::OK(); +} + +std::unique_ptr NVPTXGetTargetMachine( + llvm::Triple target_triple, GpuVersion gpu_version, + const HloModuleConfig&) { + // Figure out the exact name of the processor as known to the NVPTX backend + // from the gpu_architecture flag. 
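// (NVPTXTargetModuleLinker above and NVPTXGetTargetMachine here become the
// CUDA implementations of the two hooks accepted by the generic
// LinkAndOptimizeModule below; written out, those hook types are
//   TargetModuleLinker   = std::function<Status(
//       llvm::Module*, GpuVersion, const HloModuleConfig&, const string&)>
//   GetLLVMTargetMachine = std::function<std::unique_ptr<llvm::TargetMachine>(
//       llvm::Triple, GpuVersion, const HloModuleConfig&)>.)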
+ return GetTargetMachine( + target_triple, GetSmName(absl::get>(gpu_version)), + hlo_module_config, "+ptx60"); +} + +} // namespace nvptx + +namespace { +using TargetModuleLinker = std::function; +using GetLLVMTargetMachine = std::function( + llvm::Triple, GpuVersion, const HloModuleConfig&)>; + +StatusOr> LinkAndOptimizeModule( + llvm::Module* module, GpuVersion gpu_version, + const HloModuleConfig& hlo_module_config, + const string& device_bitcode_dir_path, TargetModuleLinker module_linker, + const string& default_target_triple, + GetLLVMTargetMachine get_llvm_target_machine, int inline_threshold) { + TF_RETURN_IF_ERROR(module_linker(module, gpu_version, hlo_module_config, + device_bitcode_dir_path)); + IrDumpingPassManager module_passes(module->getModuleIdentifier(), "", false); // Add an appropriate TargetLibraryInfo pass for the module's triple. @@ -367,14 +397,11 @@ StatusOr> LinkAndOptimizeModule( llvm::Triple target_triple = llvm::Triple(module->getTargetTriple()); if (target_triple.getArch() == llvm::Triple::UnknownArch) { LOG(WARNING) << "target triple not found in the module"; - target_triple = llvm::Triple("nvptx64-unknown-unknown"); + target_triple = llvm::Triple(default_target_triple); } - // Figure out the exact name of the processor as known to the NVPTX backend - // from the gpu_architecture flag. std::unique_ptr target_machine = - GetTargetMachine(target_triple, GetSmName(compute_capability), - hlo_module_config, "+ptx60"); + get_llvm_target_machine(target_triple, gpu_version, hlo_module_config); module_passes.add(llvm::createTargetTransformInfoWrapperPass( target_machine->getTargetIRAnalysis())); @@ -405,7 +432,7 @@ StatusOr> LinkAndOptimizeModule( // Add optimization passes, and set inliner threshold. AddOptimizationPasses(opt_level, /*size_level=*/0, target_machine.get(), &module_passes, - &function_passes, kDefaultInlineThreshold); + &function_passes, inline_threshold); // Loop unrolling exposes more opportunities for SROA. Therefore, we run SROA // again after the standard optimization passes [http://b/13329423]. @@ -435,6 +462,9 @@ StatusOr> LinkAndOptimizeModule( return std::move(target_machine); } +} // namespace + +namespace nvptx { // One-time module initializer. // Must be called only once -- DO NOT CALL DIRECTLY. 
void NVPTXBackendInit(const HloModuleConfig& hlo_module_config) { @@ -481,8 +511,7 @@ void NVPTXBackendInit(const HloModuleConfig& hlo_module_config) { InitializePasses(registry); } -StatusOr CompileToPtx(llvm::Module* module, - std::pair compute_capability, +StatusOr CompileToPtx(llvm::Module* module, GpuVersion gpu_version, const HloModuleConfig& hlo_module_config, const string& libdevice_dir_path) { static std::once_flag backend_init_flag; @@ -506,8 +535,10 @@ StatusOr CompileToPtx(llvm::Module* module, TF_ASSIGN_OR_RETURN( target_machine, - LinkAndOptimizeModule(module, compute_capability, hlo_module_config, - libdevice_dir_path)); + LinkAndOptimizeModule(module, gpu_version, hlo_module_config, + libdevice_dir_path, NVPTXTargetModuleLinker, + "nvptx64-unknown-unknown", NVPTXGetTargetMachine, + kDefaultInlineThreshold)); TF_ASSIGN_OR_RETURN(ptx, EmitModuleToPTX(module, target_machine.get())); } return ptx; @@ -555,92 +586,22 @@ Status LinkROCDLIfNecessary(llvm::Module* module, int amdgpu_version, GetROCDLPaths(amdgpu_version, rocdl_dir_path)); } -StatusOr> LinkAndOptimizeModule( - llvm::Module* module, int amdgpu_version, - const HloModuleConfig& hlo_module_config, - const string& device_bitcode_dir_path) { +Status AMDGPUTargetModuleLinker(llvm::Module* module, GpuVersion gpu_version, + const HloModuleConfig& hlo_module_config, + const string& device_bitcode_dir_path) { // Link the input module with ROCDL. - TF_RETURN_IF_ERROR(amdgpu::LinkROCDLIfNecessary(module, amdgpu_version, - device_bitcode_dir_path)); + TF_RETURN_IF_ERROR(amdgpu::LinkROCDLIfNecessary( + module, absl::get(gpu_version), device_bitcode_dir_path)); - IrDumpingPassManager module_passes(module->getModuleIdentifier(), "", false); + return Status::OK(); +} - // Add an appropriate TargetLibraryInfo pass for the module's triple. - llvm::TargetLibraryInfoWrapperPass* tliwp = - new llvm::TargetLibraryInfoWrapperPass( - llvm::Triple(module->getTargetTriple())); - module_passes.add(tliwp); - - // Try to fetch the target triple from the module. If not present, set a - // default target triple. - llvm::Triple target_triple = llvm::Triple(module->getTargetTriple()); - if (target_triple.getArch() == llvm::Triple::UnknownArch) { - LOG(WARNING) << "target triple not found in the module"; - target_triple = llvm::Triple("amdgcn--amdhsa-amdgiz"); - } - - // Construct LLVM TargetMachine. - std::unique_ptr target_machine = GetTargetMachine( - target_triple, absl::StrCat("gfx", amdgpu_version), - hlo_module_config, "-code-object-v3"); - - module_passes.add(llvm::createTargetTransformInfoWrapperPass( - target_machine->getTargetIRAnalysis())); - - // The LLVM IR verifier performs sanity checking on the IR. This helps - // discover problems and report them in a meaningful manner, rather than let - // later passes report obscure assertions because of unfulfilled invariants. - module_passes.add(llvm::createVerifierPass()); - - // Create the function-level pass manager. It needs data layout information - // too. 
- llvm::legacy::FunctionPassManager function_passes(module); - - int32 opt_level = - hlo_module_config.debug_options().xla_backend_optimization_level(); - - if (opt_level < 2) { - LOG(ERROR) << std::string(80, '*'); - LOG(ERROR) << "The XLA GPU backend doesn't support unoptimized code " - "generation but "; - LOG(ERROR) << "--xla_backend_optimization_level is set to " << opt_level - << "!"; - LOG(ERROR) << "(Supported configuration is " - "--xla_backend_optimization_level >= 2.)"; - LOG(ERROR) << std::string(80, '*'); - } - - // Add optimization passes, and set inliner threshold. - AddOptimizationPasses(opt_level, - /*size_level=*/0, target_machine.get(), &module_passes, - &function_passes, kAMDGPUInlineThreshold); - - // Loop unrolling exposes more opportunities for SROA. Therefore, we run SROA - // again after the standard optimization passes [http://b/13329423]. - // TODO(jingyue): SROA may further expose more optimization opportunities such - // as more precise alias analysis and more function inlining (SROA may change - // the inlining cost of a function). For now, running SROA already emits good - // enough code for the evaluated benchmarks. We may want to run more - // optimizations later. - if (opt_level > 0) { - // LLVM's optimizer turns on SROA when the optimization level is greater - // than 0. We mimic this behavior here. - module_passes.add(llvm::createSROAPass()); - } - - // Verify that the module is well formed after optimizations ran. - module_passes.add(llvm::createVerifierPass()); - - // Done populating the pass managers. Now run them. - - function_passes.doInitialization(); - for (auto func = module->begin(); func != module->end(); ++func) { - function_passes.run(*func); - } - function_passes.doFinalization(); - module_passes.run(*module); - - return std::move(target_machine); +std::unique_ptr AMDGPUGetTargetMachine( + llvm::Triple target_triple, GpuVersion gpu_version, + const HloModuleConfig&) { + return std::move(GetTargetMachine( + target_triple, absl::StrCat("gfx", absl::get(gpu_version)), + hlo_module_config, "-code-object-v3")); } } // namespace amdgpu From 87d2f0e7af14526e0e9910b51b6b0ee69396fcbb Mon Sep 17 00:00:00 2001 From: "Wen-Heng (Jack) Chung" Date: Tue, 9 Jul 2019 16:35:45 -0500 Subject: [PATCH 0150/3053] Fix build errors. --- .../xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc | 4 ++-- .../xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h | 3 +-- tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc | 7 ++++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc index cb9797a002f..188997293c7 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc @@ -359,7 +359,7 @@ Status NVPTXTargetModuleLinker(llvm::Module* module, GpuVersion gpu_version, std::unique_ptr NVPTXGetTargetMachine( llvm::Triple target_triple, GpuVersion gpu_version, - const HloModuleConfig&) { + const HloModuleConfig& hlo_module_config) { // Figure out the exact name of the processor as known to the NVPTX backend // from the gpu_architecture flag. 
return GetTargetMachine( @@ -598,7 +598,7 @@ Status AMDGPUTargetModuleLinker(llvm::Module* module, GpuVersion gpu_version, std::unique_ptr AMDGPUGetTargetMachine( llvm::Triple target_triple, GpuVersion gpu_version, - const HloModuleConfig&) { + const HloModuleConfig& hlo_module_config) { return std::move(GetTargetMachine( target_triple, absl::StrCat("gfx", absl::get(gpu_version)), hlo_module_config, "-code-object-v3")); diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h index d1528dd3604..825bb11344f 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h @@ -37,8 +37,7 @@ namespace nvptx { // The Compile.* interfaces each create their own llvm::LLVMContext objects for // thread safety, but note that LLVM's multithreaded support is very // preliminary; multithreaded use is not recommended at this time. -StatusOr CompileToPtx(llvm::Module* module, - std::pair compute_capability, +StatusOr CompileToPtx(llvm::Module* module, GpuVersion gpu_version, const HloModuleConfig& hlo_module_config, const string& libdevice_dir_path); } // namespace nvptx diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc index 86915d9bce6..29f122c0f81 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -628,9 +628,10 @@ StatusOr> NVPTXCompiler::RunBackend( string ptx; { XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunBackend - CompileToPtx"); - TF_ASSIGN_OR_RETURN(ptx, - nvptx::CompileToPtx(&llvm_module, {cc_major, cc_minor}, - module->config(), libdevice_dir)); + TF_ASSIGN_OR_RETURN( + ptx, nvptx::CompileToPtx(&llvm_module, + std::pair{cc_major, cc_minor}, + module->config(), libdevice_dir)); } llvm_ir::DumpIrIfEnabled(*module, llvm_module, /*optimized=*/true); From 9042c058762661dc457cb333686283de3700bf17 Mon Sep 17 00:00:00 2001 From: "Wen-Heng (Jack) Chung" Date: Mon, 15 Jul 2019 16:18:22 +0000 Subject: [PATCH 0151/3053] Address code review comments. - Move all utility functions into anonymous namespace. - Refactor signature of LinkAndOptimizeModule. LLVMGetTargetMachine is invoked outside of it. - Add checks for GpuVersion. --- .../gpu/llvm_gpu_backend/nvptx_backend_lib.cc | 101 ++++++++---------- 1 file changed, 43 insertions(+), 58 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc index 188997293c7..0974f25ce52 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc @@ -64,15 +64,11 @@ limitations under the License. namespace xla { namespace gpu { -namespace amdgpu { +namespace { // Inline threshold value to use in LLVM AMDGPU backend. const int kAMDGPUInlineThreshold = 0x100000; -} // namespace amdgpu - -namespace nvptx { - // Default inline threshold value to use in llvm. const int kDefaultInlineThreshold = 1100; @@ -105,10 +101,6 @@ static string GetSmName(std::pair compute_capability) { return absl::StrCat("sm_", sm_version); } -} // namespace nvptx - -namespace { - // Convenience function for producing a name of a temporary compilation product // from the input filename. 
string MakeNameForTempProduct(absl::string_view input_filename, @@ -216,9 +208,6 @@ void EmitBitcodeToFile(const Module& module, absl::string_view filename) { outfile.keep(); } -} // namespace - -namespace nvptx { // Emits the given module to PTX. target_machine is an initialized TargetMachine // for the NVPTX target. StatusOr EmitModuleToPTX(Module* module, @@ -243,9 +232,6 @@ StatusOr EmitModuleToPTX(Module* module, return ptx; } -} // namespace nvptx - -namespace { // LLVM has an extensive flags mechanism of its own, which is only accessible // through the command line. Internal libraries within LLVM register parsers for // flags, with no other way to configure them except pass these flags. @@ -305,10 +291,6 @@ Status LinkWithBitcodeVector(llvm::Module* module, return Status::OK(); } -} // namespace - -namespace nvptx { - // Links libdevice into the given module if the module needs libdevice. Status LinkLibdeviceIfNecessary(llvm::Module* module, std::pair compute_capability, @@ -358,29 +340,24 @@ Status NVPTXTargetModuleLinker(llvm::Module* module, GpuVersion gpu_version, } std::unique_ptr NVPTXGetTargetMachine( - llvm::Triple target_triple, GpuVersion gpu_version, + llvm::Triple target_triple, std::pair compute_capability, const HloModuleConfig& hlo_module_config) { // Figure out the exact name of the processor as known to the NVPTX backend // from the gpu_architecture flag. - return GetTargetMachine( - target_triple, GetSmName(absl::get>(gpu_version)), - hlo_module_config, "+ptx60"); + return GetTargetMachine(target_triple, GetSmName(compute_capability), + hlo_module_config, "+ptx60"); } -} // namespace nvptx - -namespace { using TargetModuleLinker = std::function; -using GetLLVMTargetMachine = std::function( - llvm::Triple, GpuVersion, const HloModuleConfig&)>; -StatusOr> LinkAndOptimizeModule( - llvm::Module* module, GpuVersion gpu_version, - const HloModuleConfig& hlo_module_config, - const string& device_bitcode_dir_path, TargetModuleLinker module_linker, - const string& default_target_triple, - GetLLVMTargetMachine get_llvm_target_machine, int inline_threshold) { +Status LinkAndOptimizeModule(llvm::Module* module, GpuVersion gpu_version, + const HloModuleConfig& hlo_module_config, + const string& device_bitcode_dir_path, + TargetModuleLinker module_linker, + llvm::Triple default_target_triple, + llvm::TargetMachine* target_machine, + int inline_threshold) { TF_RETURN_IF_ERROR(module_linker(module, gpu_version, hlo_module_config, device_bitcode_dir_path)); @@ -397,12 +374,9 @@ StatusOr> LinkAndOptimizeModule( llvm::Triple target_triple = llvm::Triple(module->getTargetTriple()); if (target_triple.getArch() == llvm::Triple::UnknownArch) { LOG(WARNING) << "target triple not found in the module"; - target_triple = llvm::Triple(default_target_triple); + target_triple = default_target_triple; } - std::unique_ptr target_machine = - get_llvm_target_machine(target_triple, gpu_version, hlo_module_config); - module_passes.add(llvm::createTargetTransformInfoWrapperPass( target_machine->getTargetIRAnalysis())); @@ -431,7 +405,7 @@ StatusOr> LinkAndOptimizeModule( // Add optimization passes, and set inliner threshold. AddOptimizationPasses(opt_level, - /*size_level=*/0, target_machine.get(), &module_passes, + /*size_level=*/0, target_machine, &module_passes, &function_passes, inline_threshold); // Loop unrolling exposes more opportunities for SROA. 
Therefore, we run SROA @@ -459,12 +433,9 @@ StatusOr> LinkAndOptimizeModule( function_passes.doFinalization(); module_passes.run(*module); - return std::move(target_machine); + return Status::OK(); } -} // namespace - -namespace nvptx { // One-time module initializer. // Must be called only once -- DO NOT CALL DIRECTLY. void NVPTXBackendInit(const HloModuleConfig& hlo_module_config) { @@ -511,6 +482,10 @@ void NVPTXBackendInit(const HloModuleConfig& hlo_module_config) { InitializePasses(registry); } +} // namespace + +namespace nvptx { + StatusOr CompileToPtx(llvm::Module* module, GpuVersion gpu_version, const HloModuleConfig& hlo_module_config, const string& libdevice_dir_path) { @@ -533,20 +508,30 @@ StatusOr CompileToPtx(llvm::Module* module, GpuVersion gpu_version, return string(); } - TF_ASSIGN_OR_RETURN( - target_machine, - LinkAndOptimizeModule(module, gpu_version, hlo_module_config, - libdevice_dir_path, NVPTXTargetModuleLinker, - "nvptx64-unknown-unknown", NVPTXGetTargetMachine, - kDefaultInlineThreshold)); - TF_ASSIGN_OR_RETURN(ptx, EmitModuleToPTX(module, target_machine.get())); + auto compute_capability = absl::get_if>(&gpu_version); + if (compute_capability) { + llvm::Triple target_triple("nvptx64-unknown-unknown"); + // Construct LLVM TargetMachine for NVPTX. + std::unique_ptr target_machine = + NVPTXGetTargetMachine(target_triple, *compute_capability, + hlo_module_config); + + // Link with libdeivce, and optimize the LLVM module. + TF_RETURN_IF_ERROR(LinkAndOptimizeModule( + module, gpu_version, hlo_module_config, libdevice_dir_path, + NVPTXTargetModuleLinker, target_triple, + target_machine.get(), kDefaultInlineThreshold)); + + // Lower optimize LLVM module to PTX. + TF_ASSIGN_OR_RETURN(ptx, EmitModuleToPTX(module, target_machine.get())); + } } return ptx; } } // namespace nvptx -namespace amdgpu { +namespace { // Gets the ROCm-Device-Libs filenames for a particular AMDGPU version. static std::vector GetROCDLPaths(int amdgpu_version, @@ -590,21 +575,21 @@ Status AMDGPUTargetModuleLinker(llvm::Module* module, GpuVersion gpu_version, const HloModuleConfig& hlo_module_config, const string& device_bitcode_dir_path) { // Link the input module with ROCDL. - TF_RETURN_IF_ERROR(amdgpu::LinkROCDLIfNecessary( - module, absl::get(gpu_version), device_bitcode_dir_path)); + TF_RETURN_IF_ERROR(LinkROCDLIfNecessary(module, absl::get(gpu_version), + device_bitcode_dir_path)); return Status::OK(); } std::unique_ptr AMDGPUGetTargetMachine( - llvm::Triple target_triple, GpuVersion gpu_version, + llvm::Triple target_triple, int amdgpu_version, const HloModuleConfig& hlo_module_config) { - return std::move(GetTargetMachine( - target_triple, absl::StrCat("gfx", absl::get(gpu_version)), - hlo_module_config, "-code-object-v3")); + return std::move(GetTargetMachine(target_triple, + absl::StrCat("gfx", amdgpu_version), + hlo_module_config, "-code-object-v3")); } -} // namespace amdgpu +} // namespace } // namespace gpu } // namespace xla From 5f85ec39020eb9cf3e361e75ec8e130881381b53 Mon Sep 17 00:00:00 2001 From: "Wen-Heng (Jack) Chung" Date: Tue, 16 Jul 2019 09:48:10 -0500 Subject: [PATCH 0152/3053] Address code review comments. 
--- tensorflow/compiler/xla/BUILD | 1 - .../gpu/llvm_gpu_backend/nvptx_backend_lib.cc | 79 ++++++++++--------- tensorflow/compiler/xla/types.h | 1 - 3 files changed, 41 insertions(+), 40 deletions(-) diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index ba728af76cf..eeb598b165b 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -130,7 +130,6 @@ cc_library( deps = [ "//tensorflow/core:framework_lite", "//third_party/eigen3", - "@com_google_absl//absl/types:variant", ], ) diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc index 0974f25ce52..b7870a98d31 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc @@ -63,7 +63,6 @@ limitations under the License. namespace xla { namespace gpu { - namespace { // Inline threshold value to use in LLVM AMDGPU backend. @@ -210,8 +209,7 @@ void EmitBitcodeToFile(const Module& module, absl::string_view filename) { // Emits the given module to PTX. target_machine is an initialized TargetMachine // for the NVPTX target. -StatusOr EmitModuleToPTX(Module* module, - llvm::TargetMachine* target_machine) { +string EmitModuleToPTX(Module* module, llvm::TargetMachine* target_machine) { std::string ptx; // need a std::string instead of a ::string. { llvm::raw_string_ostream stream(ptx); @@ -269,9 +267,9 @@ Status LinkWithBitcodeVector(llvm::Module* module, for (auto& bitcode_path : bitcode_path_vector) { if (!tensorflow::Env::Default()->FileExists(bitcode_path).ok()) { - LOG(WARNING) << "bitcode module is required by this HLO module but was " - "not found at " - << bitcode_path; + LOG(ERROR) << "bitcode module is required by this HLO module but was " + "not found at " + << bitcode_path; return xla::InternalError("bitcode module not found at %s", bitcode_path); } @@ -311,8 +309,7 @@ Status LinkLibdeviceIfNecessary(llvm::Module* module, } VLOG(1) << "Linking with libdevice from: " << libdevice_path; - std::vector libdevice_path_vector{libdevice_path}; - return LinkWithBitcodeVector(module, libdevice_path_vector); + return LinkWithBitcodeVector(module, {libdevice_path}); } Status NVPTXTargetModuleLinker(llvm::Module* module, GpuVersion gpu_version, @@ -320,9 +317,12 @@ Status NVPTXTargetModuleLinker(llvm::Module* module, GpuVersion gpu_version, const string& device_bitcode_dir_path) { // Link the input module with libdevice, to pull in implementations of some // builtins. - TF_RETURN_IF_ERROR(LinkLibdeviceIfNecessary( - module, absl::get>(gpu_version), - device_bitcode_dir_path)); + auto compute_capability = absl::get_if>(&gpu_version); + if (!compute_capability) { + return xla::InternalError("Incompatible compute capability was specified."); + } + TF_RETURN_IF_ERROR(LinkLibdeviceIfNecessary(module, *compute_capability, + device_bitcode_dir_path)); // Set the flush-denormals-to-zero flag on the module so the NVVM reflect pass // can access it. @@ -509,22 +509,24 @@ StatusOr CompileToPtx(llvm::Module* module, GpuVersion gpu_version, } auto compute_capability = absl::get_if>(&gpu_version); - if (compute_capability) { - llvm::Triple target_triple("nvptx64-unknown-unknown"); - // Construct LLVM TargetMachine for NVPTX. - std::unique_ptr target_machine = - NVPTXGetTargetMachine(target_triple, *compute_capability, - hlo_module_config); - - // Link with libdeivce, and optimize the LLVM module. 
- TF_RETURN_IF_ERROR(LinkAndOptimizeModule( - module, gpu_version, hlo_module_config, libdevice_dir_path, - NVPTXTargetModuleLinker, target_triple, - target_machine.get(), kDefaultInlineThreshold)); - - // Lower optimize LLVM module to PTX. - TF_ASSIGN_OR_RETURN(ptx, EmitModuleToPTX(module, target_machine.get())); + if (!compute_capability) { + return xla::InternalError( + "Incompatible compute capability was specified."); } + + llvm::Triple default_target_triple("nvptx64-unknown-unknown"); + // Construct LLVM TargetMachine for NVPTX. + std::unique_ptr target_machine = NVPTXGetTargetMachine( + default_target_triple, *compute_capability, hlo_module_config); + + // Link with libdeivce, and optimize the LLVM module. + TF_RETURN_IF_ERROR(LinkAndOptimizeModule( + module, gpu_version, hlo_module_config, libdevice_dir_path, + NVPTXTargetModuleLinker, default_target_triple, target_machine.get(), + kDefaultInlineThreshold)); + + // Lower optimized LLVM module to PTX. + ptx = EmitModuleToPTX(module, target_machine.get()); } return ptx; } @@ -537,19 +539,15 @@ namespace { static std::vector GetROCDLPaths(int amdgpu_version, const string& rocdl_dir_path) { // AMDGPU version-neutral bitcodes. - std::vector rocdl_filename_vector{ - "hc.amdgcn.bc", - "opencl.amdgcn.bc", - "ocml.amdgcn.bc", - "ockl.amdgcn.bc", - "oclc_finite_only_off.amdgcn.bc", - "oclc_daz_opt_off.amdgcn.bc", - "oclc_correctly_rounded_sqrt_on.amdgcn.bc", - "oclc_unsafe_math_off.amdgcn.bc"}; + static std::vector* rocdl_filenames = new std::vector( + {"hc.amdgcn.bc", "opencl.amdgcn.bc", "ocml.amdgcn.bc", "ockl.amdgcn.bc", + "oclc_finite_only_off.amdgcn.bc", "oclc_daz_opt_off.amdgcn.bc", + "oclc_correctly_rounded_sqrt_on.amdgcn.bc", + "oclc_unsafe_math_off.amdgcn.bc"}); // Construct full path to ROCDL bitcode libraries. std::vector result; - for (auto& filename : rocdl_filename_vector) { + for (auto& filename : *rocdl_filenames) { result.push_back(tensorflow::io::JoinPath(rocdl_dir_path, filename)); } @@ -575,8 +573,13 @@ Status AMDGPUTargetModuleLinker(llvm::Module* module, GpuVersion gpu_version, const HloModuleConfig& hlo_module_config, const string& device_bitcode_dir_path) { // Link the input module with ROCDL. - TF_RETURN_IF_ERROR(LinkROCDLIfNecessary(module, absl::get(gpu_version), - device_bitcode_dir_path)); + auto amdgpu_version = absl::get_if(&gpu_version); + if (!amdgpu_version) { + return xla::InternalError( + "Incompatible AMD GCN ISA version was specified."); + } + TF_RETURN_IF_ERROR( + LinkROCDLIfNecessary(module, *amdgpu_version, device_bitcode_dir_path)); return Status::OK(); } diff --git a/tensorflow/compiler/xla/types.h b/tensorflow/compiler/xla/types.h index 8b1e9942680..3b4e1aef08b 100644 --- a/tensorflow/compiler/xla/types.h +++ b/tensorflow/compiler/xla/types.h @@ -19,7 +19,6 @@ limitations under the License. #include #include -#include "absl/types/variant.h" #include "third_party/eigen3/Eigen/Core" #include "tensorflow/core/framework/numeric_types.h" #include "tensorflow/core/platform/types.h" From 9770648bd385639feeaebfbb9b38fb6da9d50914 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 19 Jun 2019 22:33:10 +0000 Subject: [PATCH 0153/3053] Fix the issue of tf.range where tensor with a different dtype is passed This fix tries to address the issue raised in 29867 where the following raises error: ``` tf.range(tf.constant(102), dtype=tf.float32) ... ... 
ValueError: Tensor conversion requested dtype float32 for Tensor with dtype int32: 'tf.Tensor(102, shape=(), dtype=int32)' ``` This is different from `tf.arange` where different types could be used: ``` np.arange(np.int(102), dtype=np.float32) ``` The issue is that in tf.range cast is only done when dtype is not passed explicitly. This fix adds additional processing so that the above scenario is covered. This fix fixes 29867. Signed-off-by: Yong Tang --- tensorflow/python/ops/math_ops.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 84372b3c922..114df461a8b 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -1349,9 +1349,28 @@ def range(start, limit=None, delta=1, dtype=None, name="range"): # pylint: disa start, limit = 0, start with ops.name_scope(name, "Range", [start, limit, delta]) as name: - start = ops.convert_to_tensor(start, dtype=dtype, name="start") - limit = ops.convert_to_tensor(limit, dtype=dtype, name="limit") - delta = ops.convert_to_tensor(delta, dtype=dtype, name="delta") + # In case start, limit, or delta is already a tensor and have different + # dtype with the specified dtype, try to do a cast to see if the dtype is + # compatible. Otherwise pass to convert_to_tensor. This is to handle + # the situation with: + # tf.range(tf.constant(5), dtype=tf.float32) + # which is comparable with: + # np.arange(np.int(5), dtype=np.float32) + if (isinstance(start, ops.Tensor) and + dtype is not None and dtype != start.dtype): + start = cast(start, dtype=dtype) + else: + start = ops.convert_to_tensor(start, dtype=dtype, name="start") + if (isinstance(limit, ops.Tensor) and + dtype is not None and dtype != limit.dtype): + limit = cast(limit, dtype=dtype) + else: + limit = ops.convert_to_tensor(limit, dtype=dtype, name="limit") + if (isinstance(delta, ops.Tensor) and + dtype is not None and dtype != delta.dtype): + delta = cast(delta, dtype=dtype) + else: + delta = ops.convert_to_tensor(delta, dtype=dtype, name="delta") # infer dtype if not explicitly provided if dtype is None: From 085160e48ace499de3ea2a58f3a4bd3c8cd07dc8 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 19 Jun 2019 22:36:03 +0000 Subject: [PATCH 0154/3053] Add test case for GitHub issue 29867, Signed-off-by: Yong Tang --- tensorflow/python/kernel_tests/init_ops_test.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tensorflow/python/kernel_tests/init_ops_test.py b/tensorflow/python/kernel_tests/init_ops_test.py index 4b9681afd2c..d4b7d20f639 100644 --- a/tensorflow/python/kernel_tests/init_ops_test.py +++ b/tensorflow/python/kernel_tests/init_ops_test.py @@ -537,6 +537,14 @@ class RangeTest(test.TestCase): math_ops.range( 0, 0, 1, dtype=dtypes.float64).dtype, dtypes.float64) + def testMixedDType(self): + # Test case for GitHub issue 29867 + with self.cached_session(use_gpu=True): + tf_ans = math_ops.range(constant_op.constant(5), dtype=dtypes.float32) + self.assertAllEqual( + self.evaluate(tf_ans), + np.arange(np.int32(5), dtype=np.float32)) + # TODO(vrv): move to sequence_ops_test? 
class LinSpaceTest(test.TestCase): From 07ad62064c35d3c56377dea3fc23fabf14818146 Mon Sep 17 00:00:00 2001 From: Leslie-Fang Date: Fri, 19 Jul 2019 12:52:18 +0800 Subject: [PATCH 0155/3053] solve the tf.cast issue solve the issue https://github.com/tensorflow/tensorflow/issues/30215 --- tensorflow/core/grappler/optimizers/constant_folding.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc index 6b7ceff65b2..ae077af0a34 100644 --- a/tensorflow/core/grappler/optimizers/constant_folding.cc +++ b/tensorflow/core/grappler/optimizers/constant_folding.cc @@ -1209,7 +1209,7 @@ Status ConstantFolding::CreateNodeDef(const string& name, case DT_INT32: POPULATE_TENSOR_PROTO(tensor, t, int32, int); case DT_UINT32: - POPULATE_TENSOR_PROTO(tensor, t, uint32, int); + POPULATE_TENSOR_PROTO(tensor, t, uint32, uint32); case DT_INT16: POPULATE_TENSOR_PROTO(tensor, t, int16, int); case DT_UINT16: From a8c1d3f9ac3192bbd21d6440f49dbc099e3b2224 Mon Sep 17 00:00:00 2001 From: "Wen-Heng (Jack) Chung" Date: Fri, 19 Jul 2019 00:37:22 -0500 Subject: [PATCH 0156/3053] Fix bazel dependencies. --- tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD | 1 + .../xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h | 1 + 2 files changed, 2 insertions(+) diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD index 2f73fd0b3d4..91f66a2929c 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD @@ -30,6 +30,7 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service:hlo_module_config", + "//tensorflow/compiler/xla/service/gpu:gpu_types", "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h index 825bb11344f..f1f095d025e 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.h @@ -22,6 +22,7 @@ limitations under the License. #include "absl/strings/string_view.h" #include "llvm/IR/Module.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_types.h" #include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" From 99d9f28d7fed9e229902c4a17fb8d2ae0175f0be Mon Sep 17 00:00:00 2001 From: "Wen-Heng (Jack) Chung" Date: Fri, 19 Jul 2019 06:43:24 -0500 Subject: [PATCH 0157/3053] Address code review comments. --- .../xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc index b7870a98d31..b6626d34144 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc @@ -553,8 +553,8 @@ static std::vector GetROCDLPaths(int amdgpu_version, // Add AMDGPU version-specific bitcodes. 
result.push_back(tensorflow::io::JoinPath( - rocdl_dir_path, tensorflow::strings::StrCat( - "oclc_isa_version_", amdgpu_version, ".amdgcn.bc"))); + rocdl_dir_path, + absl::StrCat("oclc_isa_version_", amdgpu_version, ".amdgcn.bc"))); return std::move(result); } @@ -562,7 +562,7 @@ static std::vector GetROCDLPaths(int amdgpu_version, Status LinkROCDLIfNecessary(llvm::Module* module, int amdgpu_version, const string& rocdl_dir_path) { if (!CouldNeedDeviceBitcode(*module)) { - return tensorflow::Status::OK(); + return Status::OK(); } return LinkWithBitcodeVector(module, From e44642d3f0751f1a8bfb3ec2117ce81bebba0a1c Mon Sep 17 00:00:00 2001 From: "srinivasan.narayanamoorthy" Date: Fri, 19 Jul 2019 09:07:09 -0700 Subject: [PATCH 0158/3053] Parallelizing scatter update op. --- tensorflow/core/kernels/scatter_functor.h | 70 ++++++++++++++++++---- tensorflow/core/kernels/scatter_op_test.cc | 60 ++++++++++++++++++- 2 files changed, 116 insertions(+), 14 deletions(-) diff --git a/tensorflow/core/kernels/scatter_functor.h b/tensorflow/core/kernels/scatter_functor.h index 755f8f8dc55..bda819b272d 100644 --- a/tensorflow/core/kernels/scatter_functor.h +++ b/tensorflow/core/kernels/scatter_functor.h @@ -18,14 +18,15 @@ limitations under the License. #include -#include "third_party/eigen3/Eigen/Core" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/variant_op_registry.h" #include "tensorflow/core/kernels/dense_update_functor.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/work_sharder.h" +#include "third_party/eigen3/Eigen/Core" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" namespace tensorflow { @@ -188,6 +189,7 @@ struct AssignSYCL { } // namespace scatter_op namespace functor { +#define kMaxLocks 1024 template struct ScatterFunctor { Index operator()(OpKernelContext* c, const Device& d, @@ -205,17 +207,61 @@ struct ScatterFunctorBase { // indices and params sizes were validated in DoCompute(). const Index N = static_cast(indices.size()); const Index limit = static_cast(params.dimension(0)); - for (Index i = 0; i < N; i++) { - // Grab the index and check its validity. Do this carefully, - // to avoid checking the value and grabbing it again from - // memory a second time (a security risk since it may change in between). - const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i)); - if (!FastBoundsCheck(index, limit)) return i; - // Copy last Ndim-1 dimensions of updates[i] to params[index] - scatter_op::internal::Assign::Run(params.template chip<0>(index), - updates.template chip<0>(i)); + unsigned long int num_locks, entries_per_lock; + // Duplicate entries need to be handled correctly. + // Multiple updates to the same index has to be serialized. + // To reduce the number of locks and the memory usage, + // we divide the whole index space into kMaxLocks regions + // with each lock serializing access to a region. + if (limit <= kMaxLocks) { + num_locks = limit; + entries_per_lock = 1; + + } else { + num_locks = kMaxLocks; + entries_per_lock = (limit % kMaxLocks == 0) ? 
limit / kMaxLocks + : (limit / kMaxLocks + 1); } - return -1; + + std::vector> accessed(num_locks); + auto ParallelInit = [&](Index start, Index end) { + for (Index i = start; i < end; i++) accessed.at(i) = false; + }; + Index bad_index = -1; + auto ParallelScatter = [&](Index start, Index end) { + for (Index i = start; i < end; i++) { + // Grab the index and check its validity. Do this carefully, + // to avoid checking the value and grabbing it again from + // memory a second time (a security risk since it may change in + // between). + const Index index = ::tensorflow::internal::SubtleMustCopy(indices(i)); + if (!FastBoundsCheck(index, limit)) { + bad_index = i; + return; + } + unsigned long int lock_id = + (entries_per_lock == 1) ? index : (index / entries_per_lock); + // Copy last Ndim-1 dimensions of updates[i] to params[index] + // Separating test from test and set to improve performance and reduce + // coherence overhead. + // Test + while (accessed.at(lock_id)) { + } + // Test and Set + while (accessed.at(lock_id).exchange(true)) { + } + scatter_op::internal::Assign::Run(params.template chip<0>(index), + updates.template chip<0>(i)); + accessed.at(lock_id) = false; + } + }; + const DeviceBase::CpuWorkerThreads& worker_threads = + *(c->device()->tensorflow_cpu_worker_threads()); + Shard(worker_threads.num_threads, worker_threads.workers, num_locks, 3500.0, + ParallelInit); // Cost is arbitrary for now. + Shard(worker_threads.num_threads, worker_threads.workers, N, 3500.0, + ParallelScatter); // Cost is arbitrary for now. + return bad_index; } }; diff --git a/tensorflow/core/kernels/scatter_op_test.cc b/tensorflow/core/kernels/scatter_op_test.cc index ae6548e9ef2..2f4382758a7 100644 --- a/tensorflow/core/kernels/scatter_op_test.cc +++ b/tensorflow/core/kernels/scatter_op_test.cc @@ -47,6 +47,17 @@ class ScatterUpdateOpTest : public OpsTestBase { TF_ASSERT_OK(InitOp()); } }; +class ScatterSubOpTest : public OpsTestBase { + protected: + void MakeOp(DataType variable_ref_type, DataType index_type) { + TF_ASSERT_OK(NodeDefBuilder("myop", "ScatterSub") + .Input(FakeInput(variable_ref_type)) + .Input(FakeInput(index_type)) + .Input(FakeInput(RemoveRefType(variable_ref_type))) + .Finalize(node_def())); + TF_ASSERT_OK(InitOp()); + } +}; TEST_F(ScatterUpdateOpTest, Simple_StringType) { MakeOp(DT_STRING_REF, DT_INT32); @@ -175,6 +186,47 @@ TEST_F(ScatterUpdateOpTest, Error_IndexOutOfRange) { << s; } +TEST_F(ScatterSubOpTest, Error_IndexOutOfRange) { + MakeOp(DT_FLOAT_REF, DT_INT32); + // Feed and run + AddInputFromArray(TensorShape({14}), + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}); + AddInputFromArray(TensorShape({3}), {0, 1, 99}); + AddInputFromArray(TensorShape({3}), {100, 101, 102}); + Status s = RunOpKernel(); + EXPECT_TRUE( + absl::StrContains(s.ToString(), "indices[2] = 99 is not in [0, 14)")) + << s; +} + +TEST_F(ScatterSubOpTest, StressIndexTest) { + MakeOp(DT_INT32_REF, DT_INT32); + // Feed and run + const int kRows = 1; + std::vector values; + values.reserve(kRows); + for (int i = 0; i < kRows; i++) { + values.push_back(0); + } + const int kNumUpdates = 1000000; + std::vector indices; + std::vector updates; + for (int i = 0; i < kNumUpdates; i++) { + indices.push_back(0); + updates.push_back(1); + } + + AddInputFromArray(TensorShape({kRows}), values); + AddInputFromArray(TensorShape({kNumUpdates}), indices); + AddInputFromArray(TensorShape({kNumUpdates}), updates); + testing::ItemsProcessed((static_cast(kNumUpdates))); + Status s = RunOpKernel(); + Tensor params_tensor = 
*mutable_input(0).tensor; + Tensor expected(allocator(), DT_INT32, TensorShape({1})); + test::FillValues(&expected, {-1000000}); + test::ExpectTensorEqual(expected, params_tensor); +} + TEST_F(ScatterUpdateOpTest, Error_WrongDimsIndices) { MakeOp(DT_FLOAT_REF, DT_INT32); @@ -238,7 +290,8 @@ class ScatterUpdateBM : public ScatterUpdateOpTest { }; template -static void BM_ScatterHelper(int iters, int embedding_size, const char* op) { +static void BM_ScatterHelper(int iters, int embedding_size, const char* op, + bool big_num_updates = false) { testing::StopTiming(); const int kRows = 10000000 / embedding_size; std::vector values; @@ -246,7 +299,7 @@ static void BM_ScatterHelper(int iters, int embedding_size, const char* op) { for (int i = 0; i < kRows * embedding_size; i++) { values.push_back(i); } - const int kNumUpdates = 1000; + const int kNumUpdates = big_num_updates ? 1000000 : 1000; random::PhiloxRandom philox(301, 17); random::SimplePhilox rnd(&philox); std::vector indices; @@ -282,7 +335,9 @@ static void BM_ScatterUpdateInt64(int iters, int embedding_size) { static void BM_ScatterAddInt32(int iters, int embedding_size) { BM_ScatterHelper(iters, embedding_size, "ScatterAdd"); + BM_ScatterHelper(iters, embedding_size, "ScatterAdd", true); } + static void BM_ScatterAddInt64(int iters, int embedding_size) { BM_ScatterHelper(iters, embedding_size, "ScatterAdd"); } @@ -339,6 +394,7 @@ BENCHMARK(BM_ScatterUpdateInt64) ->Arg(100000); BENCHMARK(BM_ScatterAddInt32)->Arg(1)->Arg(10)->Arg(64)->Arg(256)->Arg(1024); + BENCHMARK(BM_ScatterAddInt64)->Arg(1)->Arg(10)->Arg(64)->Arg(256)->Arg(1024); BENCHMARK(BM_ScatterMulInt32)->Arg(1)->Arg(10)->Arg(64)->Arg(256)->Arg(1024); From 6614ad4b05556a0c7b908a73d44e54c430b49362 Mon Sep 17 00:00:00 2001 From: Bhavani Subramanian Date: Thu, 18 Jul 2019 15:09:13 -0700 Subject: [PATCH 0159/3053] Addressed review comments for 'mkl_util.h'. 
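The review changes below collapse repeated #ifdef ENABLE_MKLDNN_V1 blocks in mkl_util.h into a small set of helper macros (GET_MEMORY_DESC_FROM_MEM_PTR, MEMORY_CONSTRUCTOR, and friends), so each call site is written once regardless of which MKL-DNN API version is compiled in. A minimal sketch of the idea, using toy types and a made-up macro name rather than the real MKL-DNN classes:

```
#include <iostream>

// Toy stand-ins for the real MKL-DNN types; only the constructor signatures
// matter for this illustration.
struct Engine {};
struct Desc {};
struct Memory {
  Memory(Desc, Engine, void*) { std::cout << "constructed (v1 style)\n"; }
  Memory(Desc, void*) { std::cout << "constructed (v0 style)\n"; }
};

// One macro per construction pattern hides the version check, so call sites
// no longer carry their own #ifdef blocks.
#ifdef ENABLE_MKLDNN_V1
#define MEMORY_CONSTRUCTOR_SKETCH(desc, engine, data) Memory((desc), (engine), (data))
#else
#define MEMORY_CONSTRUCTOR_SKETCH(desc, engine, data) Memory((desc), (data))
#endif

int main() {
  Desc desc;
  Engine cpu_engine;
  int buffer = 0;
  // The call site is identical under either API version.
  Memory mem = MEMORY_CONSTRUCTOR_SKETCH(desc, cpu_engine, &buffer);
  (void)mem;
}
```

The diff applies the same approach to the descriptor accessors, where only the accessor expression differs between get_desc() and get_primitive_desc().desc().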
--- tensorflow/core/util/mkl_util.h | 78 +++++++++++++++------------------ 1 file changed, 35 insertions(+), 43 deletions(-) diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index f37f3b8a4b7..39df695699c 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -124,6 +124,15 @@ static const int kSmallBatchSize = 32; #ifdef ENABLE_MKLDNN_V1 #define ENGINE_CPU engine::kind::cpu +#define GET_MEMORY_DESC_FROM_MEM_PTR(mem_ptr) mem_ptr->get_desc() +#define GET_MEMORY_PRIMITIVE_DESC_FROM_MEM_PTR(mem_ptr) \ + GET_MEMORY_DESC_FROM_MEM_PTR(mem_ptr) +#define MEMORY_CONSTRUCTOR(mem_desc, cpu_engine, data) \ + memory(mem_desc, cpu_engine, data) +#define MEMORY_CONSTRUCTOR_WITH_MEM_PD(mem_ptr, cpu_engine, data) \ + memory(GET_MEMORY_DESC_FROM_MEM_PTR(mem_ptr), cpu_engine, data) +#define MEMORY_CONSTRUCTOR_WITHOUT_DATA(mem_desc, cpu_engine) \ + memory(mem_desc, cpu_engine) #define MEMORY_FORMAT memory::format_tag #define MKL_TENSOR_FORMAT MklTensorFormat #define MKL_TENSOR_FORMAT_BLOCKED MklTensorFormat::FORMAT_BLOCKED @@ -139,6 +148,14 @@ static const int kSmallBatchSize = 32; #define TENSOR_FORMAT_NHWC MKL_TENSOR_FORMAT_NHWC #else #define ENGINE_CPU engine::cpu +#define GET_MEMORY_DESC_FROM_MEM_PTR(mem_ptr) \ + mem_ptr->get_primitive_desc().desc() +#define GET_MEMORY_PRIMITIVE_DESC_FROM_MEM_PTR(mem_ptr) \ + mem_ptr->get_primitive_desc() +#define MEMORY_CONSTRUCTOR(mem_pd, cpu_engine, data) memory(mem_pd, data) +#define MEMORY_CONSTRUCTOR_WITH_MEM_PD(mem_ptr, cpu_engine, data) \ + memory({GET_MEMORY_DESC_FROM_MEM_PTR(mem_ptr), cpu_engine}, data) +#define MEMORY_CONSTRUCTOR_WITHOUT_DATA(mem_pd, cpu_engine) memory(mem_pd) #define MEMORY_FORMAT memory::format #define MKL_TENSOR_FORMAT memory::format #define MKL_TENSOR_FORMAT_BLOCKED memory::format::blocked @@ -633,9 +650,6 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor, &output_tensor)); engine cpu_engine(ENGINE_CPU, 0); -#ifdef ENABLE_MKLDNN_V1 - stream cpu_stream(cpu_engine); -#endif // ENABLE_MKLDNN_V1 MklDnnData input(&cpu_engine); // Get MKL layout of input tensor. @@ -655,6 +669,7 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor, DCHECK(input.CheckReorderToOpMem(output_tf_md, &output_tensor, net, net_args, &cpu_engine)); DCHECK_EQ(net.size(), net_args.size()); + stream cpu_stream(cpu_engine); for (size_t i = 0; i < net.size(); ++i) { net.at(i).execute(cpu_stream, net_args.at(i)); } @@ -1308,17 +1323,9 @@ class MklDnnData { if (user_memory_) delete user_memory_; // TODO(nhasabni): can we remove dynamic memory allocation? if (data_buffer) { -#ifdef ENABLE_MKLDNN_V1 - user_memory_ = new memory(pd, *cpu_engine_, data_buffer); -#else - user_memory_ = new memory(pd, data_buffer); -#endif // ENABLE_MKLDNN_V1 + user_memory_ = new MEMORY_CONSTRUCTOR(pd, *cpu_engine_, data_buffer); } else { -#ifdef ENABLE_MKLDNN_V1 - user_memory_ = new memory(pd, *cpu_engine_); -#else - user_memory_ = new memory(pd); -#endif // ENABLE_MKLDNN_V1 + user_memory_ = new MEMORY_CONSTRUCTOR_WITHOUT_DATA(pd, *cpu_engine_); } } @@ -1415,11 +1422,7 @@ class MklDnnData { /// @return: true in case reorder of input is needed; false, otherwise. 
inline bool IsReorderNeeded(const MEMORY_PRIMITIVE_DESC& op_pd) const { DCHECK(user_memory_); -#ifdef ENABLE_MKLDNN_V1 - return op_pd != user_memory_->get_desc(); -#else - return op_pd != user_memory_->get_primitive_desc(); -#endif // ENABLE_MKLDNN_V1 + return op_pd != GET_MEMORY_PRIMITIVE_DESC_FROM_MEM_PTR(user_memory_); } #ifndef ENABLE_MKLDNN_V1 @@ -1665,12 +1668,9 @@ class MklDnnData { inline bool PrepareReorderToUserMemIfReq(const MEMORY_PRIMITIVE_DESC& op_pd) { DCHECK(user_memory_); if (IsReorderNeeded(op_pd)) { -// TODO(nhasabni): can we remove dynamic memory allocation? -#ifdef ENABLE_MKLDNN_V1 - reorder_memory_ = new memory(op_pd, *cpu_engine_); -#else - reorder_memory_ = new memory(op_pd); -#endif // ENABLE_MKLDNN_V1 + // TODO(nhasabni): can we remove dynamic memory allocation? + reorder_memory_ = + new MEMORY_CONSTRUCTOR_WITHOUT_DATA(op_pd, *cpu_engine_); return true; } return false; @@ -1965,18 +1965,10 @@ class MklReorderPrimitive : public MklPrimitive { engine cpu_engine_ = engine(ENGINE_CPU, 0); void Setup(const memory* from, const memory* to) { - context_.src_mem.reset(new memory( -#ifdef ENABLE_MKLDNN_V1 - from->get_desc(), cpu_engine_, DummyData)); -#else - {from->get_primitive_desc().desc(), cpu_engine_}, DummyData)); -#endif // ENABLE_MKLDNN_V1 - context_.dst_mem.reset(new memory( -#ifdef ENABLE_MKLDNN_V1 - to->get_desc(), cpu_engine_, DummyData)); -#else - {to->get_primitive_desc().desc(), cpu_engine_}, DummyData)); -#endif // ENABLE_MKLDNN_V1 + context_.src_mem.reset( + new MEMORY_CONSTRUCTOR_WITH_MEM_PD(from, cpu_engine_, DummyData)); + context_.dst_mem.reset( + new MEMORY_CONSTRUCTOR_WITH_MEM_PD(to, cpu_engine_, DummyData)); context_.reorder_prim = std::make_shared( reorder(*context_.src_mem, *context_.dst_mem)); } @@ -2009,13 +2001,8 @@ class MklReorderPrimitiveFactory : public MklPrimitiveFactory { static string CreateKey(const memory* from, const memory* to) { string prefix = "reorder"; FactoryKeyCreator key_creator; -#ifdef ENABLE_MKLDNN_V1 - auto const& from_desc = from->get_desc().data; - auto const& to_desc = to->get_desc().data; -#else - auto const& from_desc = from->get_primitive_desc().desc().data; - auto const& to_desc = to->get_primitive_desc().desc().data; -#endif // ENABLE_MKLDNN_V1 + auto const& from_desc = GET_MEMORY_DESC_FROM_MEM_PTR(from).data; + auto const& to_desc = GET_MEMORY_DESC_FROM_MEM_PTR(to).data; const int KIdxFirstStride = 0; memory::dims from_dims(from_desc.dims, &from_desc.dims[from_desc.ndims]); memory::dims to_dims(to_desc.dims, &to_desc.dims[to_desc.ndims]); @@ -2089,6 +2076,11 @@ inline bool IsConv1x1StrideNot1(memory::dims filter_dims, } #undef ENGINE_CPU +#undef GET_MEMORY_DESC_FROM_MEM_PTR(mem_ptr) +#undef GET_MEMORY_PRIMITIVE_DESC_FROM_MEM_PTR(mem_ptr) +#undef MEMORY_CONSTRUCTOR(mem_desc, cpu_engine, data) +#undef MEMORY_CONSTRUCTOR_WITH_MEM_PD(mem_ptr, cpu_engine, data) +#undef MEMORY_CONSTRUCTOR_WITHOUT_DATA(mem_desc, cpu_engine) #undef MEMORY_FORMAT #undef MKL_TENSOR_FORMAT #undef MKL_TENSOR_FORMAT_BLOCKED From 1094e3e84c479a7c9a2e50a16bb1604a0eadbb19 Mon Sep 17 00:00:00 2001 From: "jojimon.varghese" Date: Fri, 19 Jul 2019 11:00:40 -0700 Subject: [PATCH 0160/3053] Fix for unit test failure --- tensorflow/core/graph/mkl_layout_pass_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc index 494abbd5170..0f1053ae3f2 100644 --- a/tensorflow/core/graph/mkl_layout_pass_test.cc +++ 
b/tensorflow/core/graph/mkl_layout_pass_test.cc @@ -3683,7 +3683,7 @@ TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNormV3_DeviceTest) { kGPUDevice); EXPECT_EQ(DoMklLayoutOptimizationPass(), "A(Input);B(Input);C(Input);D(Input);E(Input);" - "F(FusedBatchNormV3);G(Zeta)|A->F;A->G;B->F:1;C->F:2;D->F:3;" + "F(FusedBatchNorm);G(Zeta)|A->F;A->G;B->F:1;C->F:2;D->F:3;" "E->F:4;F->G:1"); } From 2cfffc875628d0947e6b3daf9f50e2fee7ba5baf Mon Sep 17 00:00:00 2001 From: Karthik Muthuraman Date: Fri, 19 Jul 2019 11:02:14 -0700 Subject: [PATCH 0161/3053] Add function reciprocal_no_nan() --- tensorflow/python/ops/math_ops.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 84372b3c922..139d61e18bc 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -4003,3 +4003,10 @@ def polyval(coeffs, x, name=None): for c in coeffs[1:]: p = c + p * x return p + +@tf_export("math.reciprocal_no_nan", "reciprocal_no_nan") +def reciprocal_no_nan(x, name=None): + with ops.name_scope(name, "reciprocal_no_nan", [x]) as scope: + x = ops.convert_to_tensor(x, name="x") + one = constant_ops.constant(1, dtype=x.dtype, name="one") + return gen_math_ops.div_no_nan(one, x, name=scope) \ No newline at end of file From e08e7bc8e5dee69c48bbcf7d41c02f9c1a095a08 Mon Sep 17 00:00:00 2001 From: Karthik Muthuraman Date: Fri, 19 Jul 2019 11:02:45 -0700 Subject: [PATCH 0162/3053] Add docstring for reciprocal_no_nan(). --- tensorflow/python/ops/math_ops.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 139d61e18bc..96d78605def 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -4006,6 +4006,28 @@ def polyval(coeffs, x, name=None): @tf_export("math.reciprocal_no_nan", "reciprocal_no_nan") def reciprocal_no_nan(x, name=None): + """Performs a safe reciprocal operation, element wise. + If a particular element is zero, the reciprocal for that element is + also set to zero. + + For example: + ```python + x = tf.constant([2.0, 0.5, 0, 1], dtype=tf.float32) + tf.math.reciprocal_no_nan(x) # [ 0.5, 2, 0.0, 1.0 ] + ``` + + Args: + x: A `Tensor` of type `float16`, `float32`, `float64` + `complex64` or `complex128`. + name: A name for the operation (optional). + + Returns: + A `Tensor` of same shape and type as `x`. + + Raises: + TypeError: x must be of a valid dtype. + + """ with ops.name_scope(name, "reciprocal_no_nan", [x]) as scope: x = ops.convert_to_tensor(x, name="x") one = constant_ops.constant(1, dtype=x.dtype, name="one") From fbe76c092a1cce865973d84502032d4b827ccfed Mon Sep 17 00:00:00 2001 From: Karthik Muthuraman Date: Fri, 19 Jul 2019 11:06:23 -0700 Subject: [PATCH 0163/3053] Add exception handling for reciprocal_no_nan() --- tensorflow/python/ops/math_ops.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 96d78605def..807c64c1991 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -4006,8 +4006,8 @@ def polyval(coeffs, x, name=None): @tf_export("math.reciprocal_no_nan", "reciprocal_no_nan") def reciprocal_no_nan(x, name=None): - """Performs a safe reciprocal operation, element wise. - If a particular element is zero, the reciprocal for that element is + """Performs a safe reciprocal operation, element wise. 
+ If a particular element is zero, the reciprocal for that element is also set to zero. For example: @@ -4023,12 +4023,17 @@ def reciprocal_no_nan(x, name=None): Returns: A `Tensor` of same shape and type as `x`. - - Raises: + + Raises: TypeError: x must be of a valid dtype. """ + allowed_dtypes = [dtypes.float16, dtypes.float32, dtypes.float64, + dtypes.complex64, dtypes.complex128] with ops.name_scope(name, "reciprocal_no_nan", [x]) as scope: x = ops.convert_to_tensor(x, name="x") - one = constant_ops.constant(1, dtype=x.dtype, name="one") - return gen_math_ops.div_no_nan(one, x, name=scope) \ No newline at end of file + if x.dtype.base_dtype not in allowed_dtypes: + raise TypeError("x has incorrect data type: {} \n " + "Expected: {}".format(x.dtype.name, allowed_dtypes)) + one = constant_op.constant(1, dtype=x.dtype.base_dtype, name="one") + return gen_math_ops.div_no_nan(one, x, name=scope) From 0460ac8248b4dce8166fd870a40f9f1d6e5a3911 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 19 Jul 2019 11:06:12 -0700 Subject: [PATCH 0164/3053] BUILD file changes only. PiperOrigin-RevId: 259001550 --- tensorflow/tools/api/golden/BUILD | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tensorflow/tools/api/golden/BUILD b/tensorflow/tools/api/golden/BUILD index 9166a18c0a8..5c2a24c0669 100644 --- a/tensorflow/tools/api/golden/BUILD +++ b/tensorflow/tools/api/golden/BUILD @@ -1,10 +1,7 @@ # TensorFlow API backwards compatibility test goldens. package( - default_visibility = [ - "//tensorflow:tensorflow_py:__subpackages__", - "//tensorflow/tools/api:__subpackages__", - ], + default_visibility = ["//visibility:public"], licenses = ["notice"], # Apache 2.0 ) From c5028649857257c4e7779f4b94a8bfc28f435f02 Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Fri, 19 Jul 2019 11:31:32 -0700 Subject: [PATCH 0165/3053] Store resource inputs in set instead of all inputs PiperOrigin-RevId: 259006371 --- .../python/framework/auto_control_deps.py | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/tensorflow/python/framework/auto_control_deps.py b/tensorflow/python/framework/auto_control_deps.py index 38f1926ac12..2e656857e87 100644 --- a/tensorflow/python/framework/auto_control_deps.py +++ b/tensorflow/python/framework/auto_control_deps.py @@ -302,15 +302,20 @@ class AutomaticControlDependencies(object): last_op_using_resource_tensor[inp] = op ops_which_must_run = set([op]) continue - found_resource = False + + resource_inputs = set() # Check for any resource inputs. If we find any, we update control_inputs - # and last_op_using_resource_tensor. Note that we dedup op.inputs in case - # op receives the same resource tensor twice as input, which would result - # in op getting a control dependency on itself. - for inp in set(op.inputs): + # and last_op_using_resource_tensor. + for inp in op.inputs: if inp.dtype != dtypes_module.resource: continue - found_resource = True + + # If the op receives the same resource tensor twice as an input, we skip + # to avoid the op getting a control dependency on itself. + if id(inp) in resource_inputs: + continue + + resource_inputs.add(id(inp)) # Deal with switches, finally. 
if inp.op.type == "Switch": self._process_switch(inp.op, ops_which_must_run, @@ -325,7 +330,8 @@ class AutomaticControlDependencies(object): if inp in merge_for_resource: merge_for_resource[inp]._add_control_input(op) # pylint: disable=protected-access last_op_using_resource_tensor[inp] = op - if (op_is_stateful(op) and not found_resource + + if (op_is_stateful(op) and not resource_inputs and op._control_flow_context is None): # pylint: disable=protected-access if None in last_op_using_resource_tensor: op._add_control_input(last_op_using_resource_tensor[None]) # pylint: disable=protected-access From 2805a7489d879b308df84b41fcf4db0ec0b374e2 Mon Sep 17 00:00:00 2001 From: Bhavani Subramanian Date: Fri, 19 Jul 2019 11:45:44 -0700 Subject: [PATCH 0166/3053] Undef'ed function macros correctly. --- tensorflow/core/util/mkl_util.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index 39df695699c..65aca5ab10d 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -2076,11 +2076,11 @@ inline bool IsConv1x1StrideNot1(memory::dims filter_dims, } #undef ENGINE_CPU -#undef GET_MEMORY_DESC_FROM_MEM_PTR(mem_ptr) -#undef GET_MEMORY_PRIMITIVE_DESC_FROM_MEM_PTR(mem_ptr) -#undef MEMORY_CONSTRUCTOR(mem_desc, cpu_engine, data) -#undef MEMORY_CONSTRUCTOR_WITH_MEM_PD(mem_ptr, cpu_engine, data) -#undef MEMORY_CONSTRUCTOR_WITHOUT_DATA(mem_desc, cpu_engine) +#undef GET_MEMORY_DESC_FROM_MEM_PTR +#undef GET_MEMORY_PRIMITIVE_DESC_FROM_MEM_PTR +#undef MEMORY_CONSTRUCTOR +#undef MEMORY_CONSTRUCTOR_WITH_MEM_PD +#undef MEMORY_CONSTRUCTOR_WITHOUT_DATA #undef MEMORY_FORMAT #undef MKL_TENSOR_FORMAT #undef MKL_TENSOR_FORMAT_BLOCKED From 02874a2e4272733cd2148ab9498f3ee4a06dc2da Mon Sep 17 00:00:00 2001 From: Jiri Simsa Date: Fri, 19 Jul 2019 11:35:58 -0700 Subject: [PATCH 0167/3053] [tf.data] Fixing a bug in TFRecordWriter. The problem was that the op kernel was not originally creating the `ResourceMgr` parameter of `IteratorContext`, which would cause any upstream dataset op that creates resources (such as `shuffle` or `cache`) to segfault. PiperOrigin-RevId: 259007273 --- tensorflow/core/kernels/data/experimental/to_tf_record_op.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/core/kernels/data/experimental/to_tf_record_op.cc b/tensorflow/core/kernels/data/experimental/to_tf_record_op.cc index 24262a50f11..9af8304735a 100644 --- a/tensorflow/core/kernels/data/experimental/to_tf_record_op.cc +++ b/tensorflow/core/kernels/data/experimental/to_tf_record_op.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/core/framework/dataset.h" #include "tensorflow/core/framework/function_handle_cache.h" #include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/kernels/data/dataset_utils.h" #include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/lib/core/threadpool.h" @@ -71,6 +72,8 @@ class ToTFRecordOp : public AsyncOpKernel { std::unique_ptr function_handle_cache = absl::make_unique(params.flr); params.function_handle_cache = function_handle_cache.get(); + auto resource_mgr = absl::make_unique(); + params.resource_mgr = resource_mgr.get(); IteratorContext iter_ctx(std::move(params)); OP_REQUIRES_OK_ASYNC( From d79c21e9ae21d51960afc26fcd984e6a953693d6 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Fri, 19 Jul 2019 11:38:52 -0700 Subject: [PATCH 0168/3053] [XLA] First implementation of memory space assignment pass. This introduces a new pass that assigns buffers to slow and large default memory space and fast and small alternate memory space. It greedily tries to place as many of the buffers as possible in the alternate memory. It determines the concrete offsets for the buffers that could be assigned in the alternate memory to account for fragmentation. If every buffer couldn't be kept in the alternate memory, it will prefetch and evict the buffers between the two memory spaces using asynchronous copy instructions (CopyStart/CopyDone). PiperOrigin-RevId: 259007791 --- tensorflow/compiler/xla/service/BUILD | 24 + .../compiler/xla/service/heap_simulator.cc | 82 ++-- .../compiler/xla/service/heap_simulator.h | 52 ++- .../compiler/xla/service/hlo_matchers.h | 2 + .../xla/service/memory_space_assignment.cc | 432 ++++++++++++++++++ .../xla/service/memory_space_assignment.h | 273 +++++++++++ .../service/memory_space_assignment_test.cc | 342 ++++++++++++++ 7 files changed, 1165 insertions(+), 42 deletions(-) create mode 100644 tensorflow/compiler/xla/service/memory_space_assignment.cc create mode 100644 tensorflow/compiler/xla/service/memory_space_assignment.h create mode 100644 tensorflow/compiler/xla/service/memory_space_assignment_test.cc diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index c4af8863c05..ce4c501ff07 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -2782,6 +2782,30 @@ tf_cc_test( ], ) +cc_library( + name = "memory_space_assignment", + srcs = ["memory_space_assignment.cc"], + hdrs = ["memory_space_assignment.h"], + deps = [ + ":heap_simulator", + ":hlo_pass", + ], +) + +tf_cc_test( + name = "memory_space_assignment_test", + srcs = ["memory_space_assignment_test.cc"], + deps = [ + ":hlo", + ":hlo_matchers", + ":memory_space_assignment", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:test", + ], +) + cc_library( name = "hlo_dce", srcs = ["hlo_dce.cc"], diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc index 83894f17445..8cc891ff33e 100644 --- a/tensorflow/compiler/xla/service/heap_simulator.cc +++ b/tensorflow/compiler/xla/service/heap_simulator.cc @@ -32,17 +32,18 @@ using absl::flat_hash_set; namespace { // FlattenSchedule walks through the instruction, and recurse into each called // computations. As it walks it also tracks down the ordinal number of each -// instruction in the schedule and store it in the `instruction_schedule`. The -// end of each computation is tracked in `computation_schedule`. +// instruction in the schedule and store it in the `instruction_schedule` and +// 'flattened_instruction_sequence`. The end of each computation is tracked in +// `computation_schedule`. 
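As a side illustration of what this flattening produces, the sketch below walks a toy nested computation and assigns one ordinal per instruction, recursing into called computations first so their instructions receive earlier times. The names and structures are hypothetical, not the real HloInstruction/HloComputation types, and the real function additionally records a flattened instruction sequence and skips instructions it has already scheduled:

```
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

// Toy stand-ins: each instruction may call at most one nested computation.
struct Computation {
  std::string name;
  std::vector<std::pair<std::string, const Computation*>> instructions;
};

int Flatten(const Computation& comp, int time,
            std::map<std::string, int>* instruction_schedule,
            std::map<std::string, int>* computation_schedule) {
  for (const auto& instr : comp.instructions) {
    if (instr.second != nullptr) {
      // Recurse so the callee's instructions get earlier ordinals than the
      // calling instruction itself.
      time = Flatten(*instr.second, time, instruction_schedule,
                     computation_schedule);
    }
    (*instruction_schedule)[instr.first] = time++;
  }
  (*computation_schedule)[comp.name] = time;  // end time of this computation
  return time;
}

int main() {
  Computation body{"body", {{"add", nullptr}}};
  Computation entry{"entry",
                    {{"param", nullptr}, {"while", &body}, {"root", nullptr}}};
  std::map<std::string, int> instr_sched, comp_sched;
  Flatten(entry, /*time=*/0, &instr_sched, &comp_sched);
  for (const auto& kv : instr_sched) {
    std::cout << kv.first << " -> " << kv.second << "\n";
  }
}
```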
int64 FlattenSchedule( const HloComputation& computation, const HloInstructionSequence& instruction_sequence, const HloSchedule* schedule, int64 start_time, + HloInstructionSequence* flattened_instruction_sequence, absl::flat_hash_map* instruction_schedule, absl::flat_hash_map* computation_schedule) { int64 time = start_time; - for (const HloInstruction* instruction : - instruction_sequence.instructions()) { + for (HloInstruction* instruction : instruction_sequence.instructions()) { if (schedule != nullptr) { // Recurse into sub computations if we have a module-scoped schedule. if (instruction->opcode() == HloOpcode::kCall || @@ -51,32 +52,37 @@ int64 FlattenSchedule( instruction->called_computations()) { const HloInstructionSequence& called_sequence = schedule->sequence(called_computation); - time = - FlattenSchedule(*called_computation, called_sequence, schedule, - time, instruction_schedule, computation_schedule); + time = FlattenSchedule(*called_computation, called_sequence, schedule, + time, flattened_instruction_sequence, + instruction_schedule, computation_schedule); computation_schedule->insert({called_computation, time}); } } if (instruction->opcode() == HloOpcode::kWhile) { const HloInstructionSequence& condition_sequence = schedule->sequence(instruction->while_condition()); - time = FlattenSchedule(*instruction->while_condition(), - condition_sequence, schedule, time, - instruction_schedule, computation_schedule); + time = + FlattenSchedule(*instruction->while_condition(), condition_sequence, + schedule, time, flattened_instruction_sequence, + instruction_schedule, computation_schedule); computation_schedule->insert({instruction->while_condition(), time}); const HloInstructionSequence& body_sequence = schedule->sequence(instruction->while_body()); - time = - FlattenSchedule(*instruction->while_body(), body_sequence, schedule, - time, instruction_schedule, computation_schedule); + time = FlattenSchedule(*instruction->while_body(), body_sequence, + schedule, time, flattened_instruction_sequence, + instruction_schedule, computation_schedule); } } if (instruction_schedule->count(instruction) != 0) { continue; } instruction_schedule->insert({instruction, time++}); + flattened_instruction_sequence->push_back(instruction); } computation_schedule->insert({&computation, time}); + DCHECK_EQ(instruction_schedule->size(), + flattened_instruction_sequence->size()); + DCHECK_EQ(instruction_schedule->size(), time); return time; } @@ -328,19 +334,18 @@ Status HeapSimulator::RunComputation( HloDataflowAnalysis& dataflow_analysis = alias_analysis.dataflow_analysis(); - // instruction_schedule and computation_schedule are the maps that track each - // instruction/computation and their ordinal in the schedule. - absl::flat_hash_map instruction_schedule; - absl::flat_hash_map computation_schedule; - // program_end_time is the time of the last instruction scheduled. It is equal // to the number of instructions in a computation. int64 program_end_time = FlattenSchedule(computation, instruction_sequence, schedule_, 0, - &instruction_schedule, &computation_schedule); + &flattened_instruction_sequence_, &instruction_schedule_, + &computation_schedule_); VLOG(1) << "Program end time: " << program_end_time; + algorithm_->SetSchedules(&flattened_instruction_sequence_, + &instruction_schedule_, &computation_schedule_); + // We track the definition and free events for each buffer, then we go through // each step and reply those events in program order. 
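The start and end events here are just schedule ordinals: a value is live from its defining instruction's ordinal to the ordinal of its last scheduled user, with the enclosing computation's end time as a fallback when no user is found in the schedule. A small sketch of that live-range computation, with hypothetical instruction names rather than real HloValue objects:

```
#include <algorithm>
#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
  // Hypothetical flattened schedule: instruction name -> ordinal.
  std::map<std::string, int> instruction_schedule = {
      {"param", 0}, {"convolution", 1}, {"add", 2}, {"root", 3}};

  // A value defined by "param" and used by "convolution" and "add".
  std::string defining_instruction = "param";
  std::vector<std::string> users = {"convolution", "add"};

  int start_time = instruction_schedule.at(defining_instruction);
  int end_time = start_time;
  for (const std::string& user : users) {
    auto it = instruction_schedule.find(user);
    if (it == instruction_schedule.end()) continue;  // user outside this scope
    end_time = std::max(end_time, it->second);
  }
  std::cout << "live range: [" << start_time << ", " << end_time << "]\n";  // [0, 2]
}
```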
absl::flat_hash_map buffer_start_map; @@ -368,14 +373,14 @@ Status HeapSimulator::RunComputation( // Keeps track of buffer start time and buffer end time. for (const HloValue* value : dataflow_analysis.values()) { // Ignore buffers that are not defined. - if (instruction_schedule.count(value->defining_instruction()) == 0) { + if (instruction_schedule_.count(value->defining_instruction()) == 0) { continue; } if (IgnoreBuffer(value)) { continue; } values_to_assign.push_back(value); - int64 buffer_start_time = instruction_schedule[value->instruction()]; + int64 buffer_start_time = instruction_schedule_[value->instruction()]; int64 buffer_end_time = -1; // A buffer's live range ends when the last user finishes executing. @@ -391,13 +396,13 @@ Status HeapSimulator::RunComputation( VLOG(1) << "Moved value " << value->ToShortString() << " to while param: " << used->ToString(); } - if (instruction_schedule.count(used) == 0) { + if (instruction_schedule_.count(used) == 0) { // We didn't track the instruction `used`. This happens when we do // computation scope (versus module scope) heap simulation and when the // used instruction is outside of the computation being simulated. continue; } - buffer_end_time = std::max(buffer_end_time, instruction_schedule[used]); + buffer_end_time = std::max(buffer_end_time, instruction_schedule_[used]); } if (buffer_end_time == -1) { @@ -412,11 +417,11 @@ Status HeapSimulator::RunComputation( if (schedule_ == nullptr && &computation != position_comp) { continue; } - if (computation_schedule.count(position_comp) == 0) { + if (computation_schedule_.count(position_comp) == 0) { continue; } buffer_end_time = - std::max(buffer_end_time, computation_schedule[position_comp]); + std::max(buffer_end_time, computation_schedule_[position_comp]); } } @@ -910,8 +915,8 @@ GlobalDecreasingSizeBestFitHeap::GetSortedBufferIntervals() const { GlobalDecreasingSizeBestFitHeap::ChunkCandidate GlobalDecreasingSizeBestFitHeap::FindChunkCandidate( - const GlobalDecreasingSizeBestFitHeap::BufferInterval& buffer_interval) - const { + const GlobalDecreasingSizeBestFitHeap::BufferInterval& buffer_interval, + int64 preferred_offset) const { VLOG(1) << "Finding chunks for buffer: " << buffer_interval.buffer->ToString(); VLOG(1) << "Size " << buffer_interval.size << ", start " @@ -960,7 +965,16 @@ GlobalDecreasingSizeBestFitHeap::FindChunkCandidate( return; } - if (free_size < min_fit_chunk.size) { + // If a preferred offset is provided, pick that offset. + if (free_offset <= preferred_offset && + free_offset + free_size >= preferred_offset + buffer_interval.size) { + min_fit_chunk = {preferred_offset, buffer_interval.size}; + } + + // Pick the min-fit chunk only if we didn't have a preferred offset or a + // chunk at the preferred offset hasn't been found. 
+ if ((preferred_offset < 0 || min_fit_chunk.offset != preferred_offset) && + free_size < min_fit_chunk.size) { min_fit_chunk = {free_offset, free_size}; } }; @@ -993,16 +1007,18 @@ void GlobalDecreasingSizeBestFitHeap::CommitChunk( interval_tree_.Add(buffer_interval.start, buffer_interval.end, chunk_candidate.chunk); for (auto colocation : GetTransitiveColocations(buffer_interval)) { - const auto emplace_result = - result_.chunk_map.emplace(colocation, chunk_candidate.chunk); - DCHECK(emplace_result.second); + AddToChunkMap(colocation, chunk_candidate.chunk); auto colocation_interval = buffer_intervals_[colocation]; interval_tree_.Add(colocation_interval.start, colocation_interval.end, chunk_candidate.chunk); } - const auto emplace_result = - result_.chunk_map.emplace(buffer_interval.buffer, chunk_candidate.chunk); + AddToChunkMap(buffer_interval.buffer, chunk_candidate.chunk); +} + +void GlobalDecreasingSizeBestFitHeap::AddToChunkMap(const HloValue* buffer, + Chunk chunk) { + const auto emplace_result = result_.chunk_map.emplace(buffer, chunk); DCHECK(emplace_result.second); } diff --git a/tensorflow/compiler/xla/service/heap_simulator.h b/tensorflow/compiler/xla/service/heap_simulator.h index 4d6de377813..f70f6c2f013 100644 --- a/tensorflow/compiler/xla/service/heap_simulator.h +++ b/tensorflow/compiler/xla/service/heap_simulator.h @@ -204,6 +204,15 @@ class HeapSimulator { absl::flat_hash_set allocated_buffers_; absl::flat_hash_set freed_buffers_; + // The flattened sequence of all instructions in the module. It contains the + // same information as instruction_schedule_, but allows fast indexing using + // the schedule index. + HloInstructionSequence flattened_instruction_sequence_; + // instruction_schedule and computation_schedule are the maps that track each + // instruction/computation and their ordinal in the schedule. + absl::flat_hash_map instruction_schedule_; + absl::flat_hash_map computation_schedule_; + // Debugging information filled in while the heap simulator runs. HeapSimulatorTrace debug_trace_; }; @@ -255,6 +264,27 @@ class HeapAlgorithm { // Finish collects the buffer offset assignment results. Free may only be // called once, after the Alloc and Free calls. virtual Result Finish() = 0; + + // Heap algorithms can optionally make use of the instruction/computation + // schedule. These data structures are guaranteed to be valid while Finish() + // is being called. + virtual void SetSchedules( + const HloInstructionSequence* flattened_instruction_sequence, + const absl::flat_hash_map* + instruction_schedule, + const absl::flat_hash_map* + computation_schedule) { + flattened_instruction_sequence_ = flattened_instruction_sequence; + instruction_schedule_ = instruction_schedule; + computation_schedule_ = computation_schedule; + } + + protected: + const HloInstructionSequence* flattened_instruction_sequence_; + const absl::flat_hash_map* + instruction_schedule_; + const absl::flat_hash_map* + computation_schedule_; }; // NoFragmentationStatsHeap computes the heap size assuming no fragmentation; @@ -370,19 +400,24 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm { // These two methods below are exposed to other heap algorithms that inherit // from this class. The Finish() method tries to find a candidate chunk for - // each BufferInterval, after calling GetSortedBufferIntervals. The - // ChunkCandidate returns the chunk and the final heap size if it chunk is to - // be committed. 
The Finish() method can then call CommitChunk to associate - // the chunk with the BufferInterval, if the final heap size is within the - // limits. - ChunkCandidate FindChunkCandidate( - const BufferInterval& buffer_interval) const; + // each BufferInterval, after calling GetSortedBufferIntervals. If a + // non-negative preferred_offset is provided, FindChunkCandidate attempts + // finding a chunk at this offset. The ChunkCandidate returns the chunk and + // the final heap size if it chunk is to be committed. The Finish() method can + // then call CommitChunk to associate the chunk with the BufferInterval, if + // the final heap size is within the limits. + ChunkCandidate FindChunkCandidate(const BufferInterval& buffer_interval, + int64 preferred_offset = -1) const; void CommitChunk(const BufferInterval& buffer_interval, ChunkCandidate chunk_candidate); + // Adds the buffer and the chunk to the result chunk map. + virtual void AddToChunkMap(const HloValue* buffer, Chunk chunk); + + absl::flat_hash_map buffer_intervals_; + Result result_; private: int64 alignment_; - Result result_; Type type_; // The current time represented as an integer. It increments by 1 at each @@ -396,7 +431,6 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm { // returns all three of them. absl::flat_hash_set GetTransitiveColocations( const BufferInterval& interval) const; - absl::flat_hash_map buffer_intervals_; }; // A heap algorithm that chooses the best results from other algorithms added to diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h index a75fc0bbc3f..789ec5d21a9 100644 --- a/tensorflow/compiler/xla/service/hlo_matchers.h +++ b/tensorflow/compiler/xla/service/hlo_matchers.h @@ -215,6 +215,8 @@ HLO_MATCHER(Constant); HLO_MATCHER(Convert); HLO_MATCHER(Convolution); HLO_MATCHER(Copy); +HLO_MATCHER(CopyDone); +HLO_MATCHER(CopyStart); HLO_MATCHER(AllReduce); HLO_MATCHER(CollectivePermute); HLO_MATCHER(Divide); diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.cc b/tensorflow/compiler/xla/service/memory_space_assignment.cc new file mode 100644 index 00000000000..f08cf01e582 --- /dev/null +++ b/tensorflow/compiler/xla/service/memory_space_assignment.cc @@ -0,0 +1,432 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/memory_space_assignment.h" + +namespace xla { + +namespace { +// Define a dummy chunk for chunks that will be allocated in the default memory +// space. 
+const HeapSimulator::Chunk kDefaultMemorySpaceDummyChunk{-1, -1}; +} // namespace + +std::vector +AlternateMemoryBestFitHeap::GetSortedColocatedIntervals( + const GlobalDecreasingSizeBestFitHeap::BufferInterval& interval) const { + std::vector colocated_intervals; + std::vector worklist = {&interval}; + while (!worklist.empty()) { + const BufferInterval* item = worklist.back(); + worklist.pop_back(); + colocated_intervals.push_back(item); + for (const HloValue* buffer_colocated : item->colocations) { + worklist.push_back(&buffer_intervals_.at(buffer_colocated)); + } + } + + absl::c_sort(colocated_intervals, [&](const BufferInterval* x, + const BufferInterval* y) { + return std::make_pair(x->start, x->end) < std::make_pair(y->start, y->end); + }); + return colocated_intervals; +} + +HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { + std::vector sorted_buffer_intervals = + GetSortedBufferIntervals(); + + VLOG(1) << "Assigning buffers to alternate memory. Max heap size = " + << max_size_in_bytes_ + << ", min prefetch interval = " << min_prefetch_interval_ + << ", max prefetch interval = " << max_prefetch_interval_; + + for (auto& interval : sorted_buffer_intervals) { + if (!interval.need_allocation) { + continue; + } + + // Skip if we have already allocated for this buffer. + const HloBuffer& buffer = + alias_analysis_.GetBufferContainingValue(*interval.buffer); + if (allocation_map_->contains(&buffer)) { + continue; + } + + auto colocated_intervals = GetSortedColocatedIntervals(interval); + bool keep_in_default_memory = false; + for (const BufferInterval* colocated_interval : colocated_intervals) { + const HloValue* value = colocated_interval->buffer; + // If any of the colocated values are phi buffers, we keep them in the + // default memory for now. + if (value->is_phi()) { + keep_in_default_memory = true; + VLOG(4) << "Keeping value " << value->ToShortString() + << " because it contains a phi node."; + break; + } + } + + MemorySpaceAssignment::AllocationSequence* allocation_sequence = + &(*allocation_map_)[&buffer]; + if (keep_in_default_memory) { + continue; + } + + // At this point, none of the colocated buffers contain any phi buffers. + for (const BufferInterval* colocated_interval : colocated_intervals) { + const HloValue* value = colocated_interval->buffer; + int64 definition_time = + instruction_schedule_->at(value->defining_instruction()); + // Iterate over the uses. + for (HloUse use : value->uses()) { + int64 use_time = instruction_schedule_->at(use.instruction); + + FindAllocation(definition_time, use_time, use, *colocated_interval, + allocation_sequence); + // If there are multiple uses, they can try using the memory allocation + // already at the alternate memory. 
+ definition_time = use_time; + } + } + } + + if (VLOG_IS_ON(3)) { + for (const auto& alloc_pair : *allocation_map_) { + VLOG(3) << "Allocation for " << alloc_pair.first->ToString(); + for (const auto& alloc : alloc_pair.second) { + std::string addr_str = ": default"; + if (alloc->memory_space() == MemorySpace::kAlternate) { + addr_str = absl::StrCat(": alt ", alloc->chunk().offset); + } + + VLOG(3) << " " << alloc->start_time() << "-" << alloc->end_time() + << addr_str << ", " << alloc->uses().size() << " uses"; + } + } + } + + return result_; +} + +HloInstruction* AlternateMemoryBestFitHeap::GetInstructionAt(int64 time) const { + return flattened_instruction_sequence_->instructions()[time]; +} + +void AlternateMemoryBestFitHeap::FindAllocation( + int64 start_time, int64 end_time, HloUse use, + const BufferInterval& interval, + MemorySpaceAssignment::AllocationSequence* allocations) { + HloInstruction* def_instruction = + use.instruction->mutable_operand(use.operand_number); + // Create an alternate memory interval that starts at the earliest + // possible position, given by max_prefetch_interval. + BufferInterval alternate_mem_interval; + alternate_mem_interval.buffer = interval.buffer; + alternate_mem_interval.size = interval.size; + alternate_mem_interval.start = + std::max(start_time, end_time - max_prefetch_interval_); + alternate_mem_interval.end = end_time; + + VLOG(2) << "Finding allocation for " << interval.buffer->ToShortString() + << " (" << start_time << ", " << end_time + << "). Size = " << interval.size; + + MemorySpaceAssignment::Allocation* prev_allocation = nullptr; + bool can_eliminate_copy = false; + if (allocations->empty()) { + // There hasn't been any allocations for this interval so far. We can + // eliminate copy if the value can be placed in the alternate memory. + can_eliminate_copy = is_allowed_in_alternate_mem_(*interval.buffer); + } else { + // If there has been a previous allocation, we can eliminate the copy if the + // previous allocation was also in the alternate memory. + prev_allocation = allocations->back().get(); + can_eliminate_copy = + (prev_allocation->memory_space() == MemorySpace::kAlternate); + } + + if (alternate_mem_interval.start == start_time && can_eliminate_copy) { + // Prefer the offset that was previously used for the previous allocation. + int64 preferred_offset = -1; + if (prev_allocation != nullptr) { + preferred_offset = prev_allocation->chunk().offset; + // If there is a previous allocation, set the start time one after the end + // of the previous allocation's end. + alternate_mem_interval.start = prev_allocation->end_time() + 1; + } + + VLOG(4) << "We can eliminate copy to alternate memory. Preferred offset = " + << preferred_offset; + ChunkCandidate chunk_candidate = + FindChunkCandidate(alternate_mem_interval, preferred_offset); + // Check if the new heap size fits within limits. Also ensure if a + // preferred offset was provided, that offset was used. + if (chunk_candidate.heap_size < max_size_in_bytes_ && + (preferred_offset == -1 || + preferred_offset == chunk_candidate.chunk.offset)) { + VLOG(3) << "Keep the buffer in alternate memory. Offset = " + << chunk_candidate.chunk.offset + << ", size = " << chunk_candidate.chunk.size + << ", heap_size = " << chunk_candidate.heap_size; + CommitChunk(alternate_mem_interval, chunk_candidate); + + // If there was a previous allocation, the buffer location is the + // same as the previous. Otherwise, it is the operand. 
+ if (prev_allocation != nullptr && + prev_allocation->defining_instruction() == def_instruction) { + prev_allocation->Extend(end_time); + } else { + allocations->push_back( + absl::make_unique( + def_instruction, MemorySpace::kAlternate, chunk_candidate.chunk, + start_time, end_time)); + } + allocations->back()->AddUse(use); + return; + } + } + + // Since copies couldn't be removed, create an allocation in the default + // memory space. + if (prev_allocation != nullptr && + prev_allocation->memory_space() == MemorySpace::kAlternate && + prev_allocation->defining_instruction() == def_instruction) { + // If there was an allocation for this HloValue that was in the alternate + // memory space, we also need to perform an eviction. + // TODO(berkin): For now evictions happen relative to the most recent + // allocation in the alternate memory. We can potentially start evictions + // earlier and end later. + HloInstruction* earliest_instruction = + GetInstructionAt(prev_allocation->start_time()); + HloInstruction* latest_instruction = + GetInstructionAt(prev_allocation->end_time()); + + VLOG(3) << "Evicting buffer at " << prev_allocation->chunk().offset << " (" + << prev_allocation->start_time() << ", " + << prev_allocation->end_time() << ")"; + VLOG(3) << "Copy to default mem between instructions " + << earliest_instruction->ToString() << " - " + << latest_instruction->ToString(); + + // The live range of this buffer is from the start time of the previous + // buffer that was in the alternate memory so that a buffer is allocated + // during the copy. + allocations->push_back( + absl::make_unique( + *prev_allocation, MemorySpace::kDefault, + kDefaultMemorySpaceDummyChunk, prev_allocation->start_time(), + end_time, earliest_instruction, latest_instruction)); + } else if (prev_allocation != nullptr && + prev_allocation->memory_space() == MemorySpace::kDefault && + prev_allocation->defining_instruction() == def_instruction) { + // If the previous allocation was in the default memory space and was + // defined by the same instruction, extend that. Otherwise, create a new + // allocation. + prev_allocation->Extend(end_time); + } else { + allocations->push_back(absl::make_unique( + def_instruction, MemorySpace::kDefault, kDefaultMemorySpaceDummyChunk, + start_time, end_time)); + } + + // Try partially placing the buffer in the alternate space. The time that is + // overlapped will be used to asynchronously copy the buffer from the + // default memory to the alternate memory. + // + // start end + // time time + // X---------------------X + // Alternate: +------+ + // Default: +---------------------+ + // ^ ^ + // Copy Copy + // Start Done + for (alternate_mem_interval.start = + std::max(start_time, end_time - max_prefetch_interval_); + alternate_mem_interval.end - alternate_mem_interval.start > + min_prefetch_interval_; + ++alternate_mem_interval.start) { + VLOG(4) << "Trying alternate memory allocation (" + << alternate_mem_interval.start << ", " + << alternate_mem_interval.end << ")"; + ChunkCandidate chunk_candidate = FindChunkCandidate(alternate_mem_interval); + // Check if the new heap size fits within limits. + if (chunk_candidate.heap_size < max_size_in_bytes_) { + HloInstruction* earliest_instruction = + GetInstructionAt(alternate_mem_interval.start); + VLOG(3) << "Move the buffer to alternate memory at " + << alternate_mem_interval.start + << ". 
Offset = " << chunk_candidate.chunk.offset + << ", size = " << chunk_candidate.chunk.size + << ", heap_size = " << chunk_candidate.heap_size; + VLOG(3) << "Copy to alternate mem between instructions " + << earliest_instruction->ToString() << " - " + << use.instruction->ToString(); + CommitChunk(alternate_mem_interval, chunk_candidate); + + // Since copies couldn't be removed, create an allocation in the + // default memory space. + allocations->push_back( + absl::make_unique( + *allocations->back().get(), MemorySpace::kAlternate, + chunk_candidate.chunk, alternate_mem_interval.start, end_time, + earliest_instruction, use.instruction)); + allocations->back()->AddUse(use); + return; + } + } + + // If a copy wasn't inserted, then add this use to the latest allocation. + allocations->back()->AddUse(use); +} + +/*static*/ StatusOr MemorySpaceAssignment::Run( + HloModule* module, int64 alternate_memory_space, int64 max_size_in_bytes, + int64 min_prefetch_interval, int64 max_prefetch_interval, + int64 alternate_memory_space_alignment_in_bytes, + BufferValue::SizeFunction size_fn, + AlternateMemoryBestFitHeap::IsAllowedInAlternateMemoryFunction + is_allowed_in_alternate_mem) { + CHECK(module->has_schedule()); + VLOG(4) << "Module before memory space assignment: " << module->ToString(); + VLOG(4) << "Schedule: " << module->schedule().ToString(); + TF_ASSIGN_OR_RETURN(auto alias_analysis, HloAliasAnalysis::Run(module)); + + MemorySpaceAssignment memory_space_assignment(module, alternate_memory_space); + // TODO(berkin): Explore heap algorithms other than kSpatial. + auto algorithm = absl::make_unique( + &memory_space_assignment.allocation_map_, max_size_in_bytes, + min_prefetch_interval, max_prefetch_interval, *alias_analysis, + alternate_memory_space_alignment_in_bytes, + GlobalDecreasingSizeBestFitHeap::Type::kSpatial, + is_allowed_in_alternate_mem); + + TF_RETURN_IF_ERROR(HeapSimulator::Run(std::move(algorithm), *module, + module->schedule(), + *alias_analysis.get(), size_fn) + .status()); + + TF_RETURN_IF_ERROR(memory_space_assignment.Process()); + TF_RETURN_IF_ERROR(memory_space_assignment.FixSchedule()); + + VLOG(4) << "Module after memory space assignment: " << module->ToString(); + VLOG(4) << "Schedule: " << module->schedule().ToString(); + TF_CHECK_OK(module->schedule().Verify()); + + return true; +} + +Status MemorySpaceAssignment::Allocation::Process( + MemorySpaceAssignment* memory_space_assignment) { + // For non-copy allocations, all we need to do is to update the output memory + // space if placed in the alternate memory. + if (memory_space_ == MemorySpace::kAlternate) { + Layout* layout = defining_instruction_->mutable_shape()->mutable_layout(); + layout->set_memory_space(memory_space_assignment->alternate_memory_space_); + } + return Status::OK(); +} + +Status MemorySpaceAssignment::CopyAllocation::Process( + MemorySpaceAssignment* memory_space_assignment) { + // Copy allocations need to insert asynchronous copy nodes. + HloInstruction* def_instruction = defining_instruction(); + CHECK_NE(def_instruction, nullptr); + + Shape shape = def_instruction->shape(); + HloComputation* computation = def_instruction->parent(); + + // Set the layout to include the memory space. 
+ Layout* layout = shape.mutable_layout(); + if (memory_space_ == MemorySpace::kAlternate) { + layout->set_memory_space(memory_space_assignment->alternate_memory_space_); + } else { + layout->set_memory_space(0); + } + + HloInstruction* copy_start = + computation->AddInstruction(HloInstruction::CreateUnary( + ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeShape(U32, {})}), + HloOpcode::kCopyStart, def_instruction)); + HloInstruction* copy_done = computation->AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kCopyDone, copy_start)); + // Update the allocation with the defining instruction so that if there + // are further copies from it, it can find the correct instruction. + defining_instruction_ = copy_done; + + // Replace all the uses with the new copy instruction. + for (HloUse use : uses_) { + TF_RETURN_IF_ERROR( + use.instruction->ReplaceOperandWith(use.operand_number, copy_done)); + } + + // Insert the new instructions at the appropriate places in the schedule. + // FixSchedule will process the maps to actually insert them. + memory_space_assignment->ScheduleAsynchronousCopy( + copy_start, copy_start_schedule_after_, copy_done, + copy_done_schedule_before_); + return Status::OK(); +} + +Status MemorySpaceAssignment::Process() { + // Insert CopyStart/CopyDone pairs. + for (auto& buffer_and_sequence : allocation_map_) { + for (auto& allocation : buffer_and_sequence.second) { + TF_RETURN_IF_ERROR(allocation->Process(this)); + } + } + return Status::OK(); +} + +void MemorySpaceAssignment::ScheduleAsynchronousCopy( + HloInstruction* copy_start, HloInstruction* copy_start_schedule_after, + HloInstruction* copy_done, HloInstruction* copy_done_schedule_before) { + schedule_after_[copy_start_schedule_after].push_back(copy_start); + schedule_before_[copy_done_schedule_before].push_back(copy_done); +} + +Status MemorySpaceAssignment::FixSchedule() { + CHECK(module_->has_schedule()); + HloSchedule& schedule = module_->schedule(); + for (const HloComputation* computation : module_->computations()) { + const HloInstructionSequence& sequence = schedule.sequence(computation); + HloInstructionSequence new_sequence; + + for (HloInstruction* instruction : sequence.instructions()) { + auto insts_before_iter = schedule_before_.find(instruction); + if (insts_before_iter != schedule_before_.end()) { + for (HloInstruction* new_instruction : insts_before_iter->second) { + new_sequence.push_back(new_instruction); + VLOG(4) << "before: " << new_instruction->ToString(); + } + } + new_sequence.push_back(instruction); + VLOG(4) << instruction->ToString(); + auto insts_after_iter = schedule_after_.find(instruction); + if (insts_after_iter != schedule_after_.end()) { + for (HloInstruction* new_instruction : insts_after_iter->second) { + new_sequence.push_back(new_instruction); + VLOG(4) << "after: " << new_instruction->ToString(); + } + } + } + schedule.set_sequence(computation, new_sequence); + } + + return Status::OK(); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.h b/tensorflow/compiler/xla/service/memory_space_assignment.h new file mode 100644 index 00000000000..556013032af --- /dev/null +++ b/tensorflow/compiler/xla/service/memory_space_assignment.h @@ -0,0 +1,273 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_H_ + +#include "tensorflow/compiler/xla/service/heap_simulator.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" + +namespace xla { + +// MemorySpaceAssignment assigns memory spaces (default or alternate) to each +// instruction in the module. It will greedily try placing as many values in +// the alternate memory space as possible. It uses the heap simulator to +// determine the actual allocation offsets of values in the alternate memory +// space to account for fragmentation. The default memory space is assumed to be +// large enough to hold the values that could not be placed in the alternate +// memory space. +class MemorySpaceAssignment { + public: + using Chunk = HeapSimulator::Chunk; + + // MemorySpaceAssignment uses a notion of a slow and large default memory + // space and a fast and small alternate memory space. + enum class MemorySpace { kDefault, kAlternate }; + + // This class represents an allocation that might either be in the default or + // alternate memory. An HloValue might live in multiple different allocations + // over its lifetime. The lifetimes of the allocations are defined using + // start_time and end_time, which correspond to the instruction indexes in + // the flattened schedule. Each of these allocations might partially overlap + // with each other. CopyAllocation defined below represents asynchronous + // copies between Allocations. + // + // Consider an instruction Foo, and its users Bar and Baz, and the times given + // in terms of the flattened schedule of the entire module: + // + // Foo:10 + // / \ + // Bar:14 \ + // Baz:25 + // + // A valid memory space assignment could be like the following: + // + // Time: 10 ... 14 ... 25 + // Foo Bar Baz + // Alternate +-------+ +-----+ + // Default +---------------------+ + // ^ ^ ^ ^ + // | | | | + // evict evict prefetch prefetch + // start end start end + // + // This would be represented with: + // - Allocation(memory_space=kAlternate, start_time=10, end_time=14) + // - CopyAllocation(memory_space=kDefault, start_time=12, end_time=25) + // - CopyAllocation(memory_space=kAlternate, start_time=22, end_time=25) + class Allocation { + public: + Allocation(HloInstruction* defining_instruction, MemorySpace memory_space, + Chunk chunk, int64 start_time, int64 end_time) + : defining_instruction_(defining_instruction), + memory_space_(memory_space), + chunk_(chunk), + start_time_(start_time), + end_time_(end_time) {} + virtual ~Allocation() = default; + + // Adds a use to this allocation. + void AddUse(HloUse use) { uses_.push_back(use); } + + // Extends the end time of this allocation. + void Extend(int64 end_time) { end_time_ = end_time; } + + // After all of the time ranges for the allocations have been assigned, + // Process morphs the instructions affected to assign the memory spaces and + // insert asynchronous copy instructions if necessary. 
+ virtual Status Process(MemorySpaceAssignment* memory_space_assignment); + + // Returns the defining instruction for this allocation. + virtual HloInstruction* defining_instruction() const { + return defining_instruction_; + } + + const std::vector& uses() const { return uses_; } + MemorySpace memory_space() const { return memory_space_; } + Chunk chunk() const { return chunk_; } + int64 start_time() const { return start_time_; } + int64 end_time() const { return end_time_; } + + protected: + HloInstruction* defining_instruction_; + std::vector uses_; + MemorySpace memory_space_; + Chunk chunk_; + int64 start_time_; + int64 end_time_; + }; + + // This class represents an allocation as a result of an asynchronous copy. + class CopyAllocation : public Allocation { + public: + CopyAllocation(const Allocation& prev_allocation, MemorySpace memory_space, + Chunk chunk, int64 start_time, int64 end_time, + HloInstruction* copy_start_schedule_after, + HloInstruction* copy_done_schedule_before) + : Allocation(/*defining_instruction=*/nullptr, memory_space, chunk, + start_time, end_time), + prev_allocation_(prev_allocation), + copy_start_schedule_after_(copy_start_schedule_after), + copy_done_schedule_before_(copy_done_schedule_before) {} + + Status Process(MemorySpaceAssignment* memory_space_assignment) override; + + HloInstruction* defining_instruction() const override { + // Unless explicitly set, the defining instruction of a copy allocation is + // retrieved from the previous allocation. + if (defining_instruction_ != nullptr) { + return defining_instruction_; + } else { + return prev_allocation_.defining_instruction(); + } + } + + private: + const Allocation& prev_allocation_; + // These variables define the scheduling boundaries where CopyStart and + // CopyDone can be scheduled. The earliest CopyStart can be scheduled is + // after copy_start_schedule_after_ and the latest CopyDone can be scheduled + // is before copy_done_schedule_before_. + HloInstruction* copy_start_schedule_after_; + HloInstruction* copy_done_schedule_before_; + }; + + using AllocationSequence = std::list>; + using AllocationMap = + absl::flat_hash_map; + + // Runs the MemorySpaceAssignment pass. alternate_memory_space is the + // architecture-specific integer value that describes the alternate memory. + // max_size_in_bytes is the maximum size of the alternate memory. + // min/max_prefetch_interval define the min/max number of independent + // instructions that can be overlapped while prefetching, which decides how + // early a prefetch can begin. alternate_memory_space_alignment_in_bytes is the alignment required + // in the alternate memory space, size_fn is the size function for buffer + // values, and is_allowed_in_alternate_mem can be used to prevent certain + // HloValues (e.g., based on the opcode) from being placed in the alternate memory. + // TODO(berkin): Use the cost model instead of using number of instructions to + // decide how early to prefetch. + static StatusOr Run( + HloModule* module, int64 alternate_memory_space, int64 max_size_in_bytes, + int64 min_prefetch_interval, int64 max_prefetch_interval, + int64 alternate_memory_space_alignment_in_bytes, + BufferValue::SizeFunction size_fn, + std::function is_allowed_in_alternate_mem); + + private: + MemorySpaceAssignment(HloModule* module, int64 alternate_memory_space) + : module_(module), alternate_memory_space_(alternate_memory_space) {} + + // Process calls Process methods of the allocations after the allocations have + // been finalized. 
+ Status Process(); + + // FixSchedule inserts asynchronous copies in the schedule. + Status FixSchedule(); + + // Schedules a pair of asynchronous copy instructions (copy_start and + // copy_done) where copy_start will be scheduled after the instruction in + // copy_start_schedule_after and copy_done will be scheduled before the + // instruction in copy_done_schedule_before. + void ScheduleAsynchronousCopy(HloInstruction* copy_start, + HloInstruction* copy_start_schedule_after, + HloInstruction* copy_done, + HloInstruction* copy_done_schedule_before); + + HloModule* module_; + int64 alternate_memory_space_; + AllocationMap allocation_map_; + + // These maps hold vectors of new instructions that need to be scheduled after + // (or before) the instruction in the key. FixSchedule uses these maps to + // modify and fix the schedule. + absl::flat_hash_map> + schedule_after_; + absl::flat_hash_map> + schedule_before_; +}; + +// This class inherits from GlobalDecreasingSizeBestFitHeap with a notion of +// maximum size. +class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { + public: + using IsAllowedInAlternateMemoryFunction = + std::function; + using MemorySpace = MemorySpaceAssignment::MemorySpace; + + AlternateMemoryBestFitHeap( + MemorySpaceAssignment::AllocationMap* allocation_map, + int64 max_size_in_bytes, int64 min_prefetch_interval, + int64 max_prefetch_interval, const HloAliasAnalysis& alias_analysis, + int64 alignment, GlobalDecreasingSizeBestFitHeap::Type type, + IsAllowedInAlternateMemoryFunction is_allowed_in_alternate_mem) + : GlobalDecreasingSizeBestFitHeap(alignment, type), + allocation_map_(allocation_map), + max_size_in_bytes_(max_size_in_bytes), + min_prefetch_interval_(min_prefetch_interval), + max_prefetch_interval_(max_prefetch_interval), + alias_analysis_(alias_analysis), + is_allowed_in_alternate_mem_(is_allowed_in_alternate_mem) {} + + HeapSimulator::Result Finish() override; + + private: + // Finds an allocation for the given interval. Internally, it will attempt to + // find a suitable chunk candidate within the heap size and prefetch interval + // limits, and append the new allocation(s) to allocations. The new + // allocations can be in default or alternate memory spaces, or can be + // prefetches or evictions. + void FindAllocation(int64 start_time, int64 end_time, HloUse use, + const BufferInterval& interval, + MemorySpaceAssignment::AllocationSequence* allocations); + + // Returns the instruction at a particular time in the flattened instruction + // schedule. + HloInstruction* GetInstructionAt(int64 time) const; + + // Given a buffer interval, returns the colocated intervals. Unlike the + // similar GlobalDecreasingSizeBestFitHeap::GetTransitiveColocations, it + // returns the colocated intervals sorted by scheduled time. + std::vector GetSortedColocatedIntervals( + const BufferInterval& interval) const; + + // Since the allocations are recorded to the AllocationMap, we don't maintain + // result_ in GlobalDecreasingSizeBestFitHeap. Override AddToChunkMap to avoid + // unnecessarily adding the chunk to the chunk map. + void AddToChunkMap(const HloValue* buffer, Chunk chunk) override {} + + MemorySpaceAssignment::AllocationMap* allocation_map_; + int64 max_size_in_bytes_; + // The min and max prefetch intervals describe the number of independent HLOs + // overlapped while a value is being prefetched into the alternate memory + // (between CopyStart and CopyDone HLO instructions). 
max_prefetch_interval + // attempts to prevent bringing tensors into the alternate memory too eagerly + // and hence occupying the space for other tensors which might use it. + // min_prefetch_interval attempts to prevent cases where tensors are + // prefetched into the alternate memory without sufficient time for the copy + // to take place. In those cases, it's just better to keep the tensor in the + // default memory instead of hurting the critical path with this copy that + // likely won't finish in time. + // TODO(berkin): Explore heuristics that take into account the cost of copying + // tensors between alternate and default memories. + int64 min_prefetch_interval_; + int64 max_prefetch_interval_; + const HloAliasAnalysis& alias_analysis_; + IsAllowedInAlternateMemoryFunction is_allowed_in_alternate_mem_; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_MEMORY_SPACE_ASSIGNMENT_H_ diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_test.cc b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc new file mode 100644 index 00000000000..5d6d0c81640 --- /dev/null +++ b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc @@ -0,0 +1,342 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/memory_space_assignment.h" + +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" + +namespace xla { +namespace { + +namespace op = xla::testing::opcode_matchers; + +class MemorySpaceAssignmentTest : public HloTestBase { + protected: + // We use the following two memory space values to describe the default (slow + // and large) and alternate (fast and small) memory spaces. + const int64 kDefaultMemorySpace = 0; + const int64 kAlternateMemorySpace = 1; + + void AssignMemorySpace(HloModule* module) { + auto size_fn = [](const BufferValue& buffer) { + return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8); + }; + + auto is_allowed_in_alternate_mem = [](const HloValue& value) { + // Check if the value belongs to the entry computation. + HloInstruction* instruction = value.instruction(); + HloComputation* computation = instruction->parent(); + bool in_entry_computation = + (computation == computation->parent()->entry_computation()); + if (in_entry_computation && + instruction->opcode() == HloOpcode::kParameter) { + return false; + } + return true; + }; + + ASSERT_IS_OK(MemorySpaceAssignment::Run( + module, kAlternateMemorySpace, /*max_size_in_bytes=*/128, + /*min_prefetch_interval=*/2, + /*max_prefetch_interval=*/10, + /*alternate_memory_space_alignment_in_bytes=*/8, size_fn, + is_allowed_in_alternate_mem) + .status()); + } +}; + +TEST_F(MemorySpaceAssignmentTest, ParameterOnly) { + // A module consisting of a single parameter. 
Inputs/outputs are currently + // excluded from memory space assignment. + HloComputation::Builder builder(TestName()); + Shape shape = ShapeUtil::MakeShape(F32, {2, 3}); + HloInstruction* p0 = + builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p0")); + + auto module = CreateNewVerifiedModule(); + HloComputation* computation = module->AddEntryComputation(builder.Build()); + + HloSchedule schedule(module.get()); + schedule.set_sequence(computation, {p0}); + TF_CHECK_OK(module->set_schedule(schedule)); + + AssignMemorySpace(module.get()); + + EXPECT_THAT(p0, op::ShapeWithLayout(shape)); +} + +TEST_F(MemorySpaceAssignmentTest, Simple) { + // A simple module with a few simple instructions. Expect this to be + // transformed with CopyStart and CopyDone instructions inserted after inputs + // and before outputs. + HloComputation::Builder builder(TestName()); + Shape shape = ShapeUtil::MakeShape(F32, {2, 3}); + HloInstruction* p0 = + builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p0")); + HloInstruction* p1 = + builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "p1")); + HloInstruction* add = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, p0, p1)); + HloInstruction* sub = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kSubtract, p0, p1)); + HloInstruction* mul = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, add, sub)); + + auto module = CreateNewVerifiedModule(); + HloComputation* computation = module->AddEntryComputation(builder.Build()); + + HloSchedule schedule(module.get()); + schedule.set_sequence(computation, {p0, p1, add, sub, mul}); + TF_CHECK_OK(module->set_schedule(schedule)); + + AssignMemorySpace(module.get()); + + // Inputs and outputs are currently placed in the default memory. Everything + // else should be in the alternate memory. + Shape shape_in_alternate_mem = ShapeUtil::MakeShapeWithLayout( + F32, {2, 3}, + /*minor_to_major=*/{1, 0}, /*tiles=*/{}, /*element_size_in_bits=*/0, + kAlternateMemorySpace); + EXPECT_THAT(p0, op::ShapeWithLayout(shape)); + EXPECT_THAT(p1, op::ShapeWithLayout(shape)); + EXPECT_THAT(mul, op::ShapeWithLayout(shape)); + EXPECT_THAT(add, op::ShapeWithLayout(shape_in_alternate_mem)); + EXPECT_THAT(sub, op::ShapeWithLayout(shape_in_alternate_mem)); +} + +TEST_F(MemorySpaceAssignmentTest, NegateChain) { + // The negate chain is long enough for asynchronous copy to be inserted + // between p1 and add. 
+ HloComputation::Builder builder(TestName()); + Shape shape = ShapeUtil::MakeShape(F32, {2, 3}); + HloInstruction* p0 = + builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p0")); + HloInstruction* p1 = + builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "p1")); + HloInstruction* negate0 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kNegate, p0)); + HloInstruction* negate1 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kNegate, negate0)); + HloInstruction* negate2 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kNegate, negate1)); + HloInstruction* negate3 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kNegate, negate2)); + HloInstruction* negate4 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kNegate, negate3)); + HloInstruction* negate5 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kNegate, negate4)); + HloInstruction* negate6 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kNegate, negate5)); + HloInstruction* add = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, negate6, p1)); + + auto module = CreateNewVerifiedModule(); + HloComputation* computation = module->AddEntryComputation(builder.Build()); + + HloSchedule schedule(module.get()); + schedule.set_sequence(computation, {p0, p1, negate0, negate1, negate2, + negate3, negate4, negate5, negate6, add}); + TF_CHECK_OK(module->set_schedule(schedule)); + + AssignMemorySpace(module.get()); + + EXPECT_THAT(add, op::Add(op::Negate(), op::AsyncCopy(kAlternateMemorySpace, + kDefaultMemorySpace, + op::Parameter(1)))); + // Parameters are in the default memory space. + EXPECT_THAT(p0, op::ShapeWithLayout(shape)); + EXPECT_THAT(p1, op::ShapeWithLayout(shape)); + // Negate instructions are in the alternate memory space (1). + Shape shape_in_alternate_mem = ShapeUtil::MakeShapeWithLayout( + F32, {2, 3}, + /*minor_to_major=*/{1, 0}, /*tiles=*/{}, /*element_size_in_bits=*/0, + kAlternateMemorySpace); + EXPECT_THAT(negate0, op::ShapeWithLayout(shape_in_alternate_mem)); + EXPECT_THAT(negate1, op::ShapeWithLayout(shape_in_alternate_mem)); + EXPECT_THAT(negate2, op::ShapeWithLayout(shape_in_alternate_mem)); + EXPECT_THAT(negate3, op::ShapeWithLayout(shape_in_alternate_mem)); + EXPECT_THAT(negate4, op::ShapeWithLayout(shape_in_alternate_mem)); + EXPECT_THAT(negate5, op::ShapeWithLayout(shape_in_alternate_mem)); + EXPECT_THAT(negate6, op::ShapeWithLayout(shape_in_alternate_mem)); + // Ensure the CopyStart/CopyDone schedules. + const HloInstructionSequence& sequence = + module->schedule().sequence(computation); + EXPECT_THAT(sequence.instructions()[0], op::Parameter(0)); + EXPECT_THAT(sequence.instructions()[1], op::Parameter(1)); + EXPECT_THAT(sequence.instructions()[2], op::CopyStart()); + EXPECT_THAT(sequence.instructions()[10], op::CopyDone()); +} + +TEST_F(MemorySpaceAssignmentTest, EvictAndPrefetch) { + HloComputation::Builder builder(TestName()); + Shape shape = ShapeUtil::MakeShape(F32, {2, 3}); + HloInstruction* p0 = + builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p0")); + HloInstruction* p1 = + builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "p1")); + HloInstruction* tanh = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kTanh, p0)); + // tanh should be placed in the alternate memory since there isn't much + // contention in the beginning. 
However, tanh has another consumer at the end. + // So it should be kicked out to default memory and prefetched back in. + // The graph below is meant to increase the contention to force + // eviction/prefetch behavior. + HloInstruction* a = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, p0, tanh)); + HloInstruction* b = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kSubtract, p0, p1)); + HloInstruction* c = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, p0, p1)); + HloInstruction* d = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kSubtract, p0, p1)); + HloInstruction* e = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, a, b)); + HloInstruction* f = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, a, c)); + HloInstruction* g = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, a, d)); + HloInstruction* h = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, b, c)); + HloInstruction* i = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, b, d)); + HloInstruction* j = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kMultiply, c, d)); + HloInstruction* k = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, e, f)); + HloInstruction* l = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, g, h)); + HloInstruction* m = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, i, j)); + HloInstruction* n = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, k, l)); + HloInstruction* o = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, n, m)); + // tanh is being used at the root instruction, and this should be prefetched. 
+ HloInstruction* add = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, o, tanh)); + + auto module = CreateNewVerifiedModule(); + HloComputation* computation = module->AddEntryComputation(builder.Build()); + + HloSchedule schedule(module.get()); + schedule.set_sequence(computation, {p0, p1, tanh, a, b, c, d, e, f, g, h, i, + j, k, l, m, n, o, add}); + TF_CHECK_OK(module->set_schedule(schedule)); + + AssignMemorySpace(module.get()); + + EXPECT_THAT( + add, + op::Add(op::Add(), + op::AsyncCopy(kAlternateMemorySpace, kDefaultMemorySpace, + op::AsyncCopy(kDefaultMemorySpace, + kAlternateMemorySpace, op::Tanh())))); +} + +TEST_F(MemorySpaceAssignmentTest, While) { + auto module = CreateNewVerifiedModule(); + Shape shape = ShapeUtil::MakeShape(xla::F32, {2, 3}); + Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {}); + Shape tuple_shape = ShapeUtil::MakeTupleShape({shape, scalar_shape}); + + auto cond_builder = HloComputation::Builder("WhileCond"); + // Tuple param: 24 bytes (each elem has 8 byte pointer, 4 byte element) + HloInstruction* cond_param = cond_builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "cond_param")); + HloInstruction* cond_iter = cond_builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape, cond_param, 1)); + HloInstruction* cond_limit = cond_builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(50.f))); + // Free cond_param[] (16 bytes), Alloc PRED[] (1 byte) + HloInstruction* cond_lt = cond_builder.AddInstruction( + HloInstruction::CreateCompare(ShapeUtil::MakeShape(PRED, {}), cond_iter, + cond_limit, ComparisonDirection::kLt)); + HloComputation* cond_computation = + module->AddEmbeddedComputation(cond_builder.Build()); + + auto body_builder = HloComputation::Builder("WhileBody"); + // Tuple param: 24 bytes (each elem has 8 byte pointer, 4 byte element) + HloInstruction* body_param = body_builder.AddInstruction( + HloInstruction::CreateParameter(0, tuple_shape, "body_param")); + HloInstruction* body_iter = body_builder.AddInstruction( + HloInstruction::CreateGetTupleElement(scalar_shape, body_param, 1)); + HloInstruction* body_data = body_builder.AddInstruction( + HloInstruction::CreateGetTupleElement(shape, body_param, 0)); + HloInstruction* body_iter_increment = body_builder.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.f))); + HloInstruction* body_iter_next = + body_builder.AddInstruction(HloInstruction::CreateBinary( + scalar_shape, HloOpcode::kAdd, body_iter, body_iter_increment)); + HloInstruction* body_data_increment = + body_builder.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR2({{1.f, 2.f, 3.f}, {4.f, 5.f, 6.f}}))); + HloInstruction* body_data_mul = + body_builder.AddInstruction(HloInstruction::CreateBinary( + shape, HloOpcode::kMultiply, body_data, body_data)); + HloInstruction* body_data_add = + body_builder.AddInstruction(HloInstruction::CreateBinary( + shape, HloOpcode::kAdd, body_data, body_data_increment)); + HloInstruction* body_data_next = + body_builder.AddInstruction(HloInstruction::CreateBinary( + shape, HloOpcode::kAdd, body_data_add, body_data_mul)); + HloInstruction* body_out = body_builder.AddInstruction( + HloInstruction::CreateTuple({body_data_next, body_iter_next})); + HloComputation* body_computation = + module->AddEmbeddedComputation(body_builder.Build()); + + auto builder = HloComputation::Builder(TestName()); + HloInstruction* data = builder.AddInstruction( + 
HloInstruction::CreateParameter(0, shape, "param_iter")); + HloInstruction* iter = builder.AddInstruction( + HloInstruction::CreateParameter(1, scalar_shape, "param_data")); + HloInstruction* tuple = + builder.AddInstruction(HloInstruction::CreateTuple({data, iter})); + HloInstruction* while_op = builder.AddInstruction(HloInstruction::CreateWhile( + tuple_shape, cond_computation, body_computation, tuple)); + HloComputation* entry_computation = + module->AddEntryComputation(builder.Build()); + + HloSchedule schedule(module.get()); + schedule.set_sequence(cond_computation, + {cond_param, cond_iter, cond_limit, cond_lt}); + schedule.set_sequence(body_computation, + {body_param, body_iter, body_data, body_iter_increment, + body_iter_next, body_data_increment, body_data_mul, + body_data_add, body_data_next, body_out}); + schedule.set_sequence(entry_computation, {iter, data, tuple, while_op}); + TF_CHECK_OK(module->set_schedule(schedule)); + + AssignMemorySpace(module.get()); + + // Ensure the tuple value and buffers used in the while instruction are + // exempted from using the alternate memory. However, body_data_mul is + // independent and can safely be placed in the alternate memory. + EXPECT_THAT(tuple, op::ShapeWithLayout(tuple_shape)); + EXPECT_THAT(data, op::ShapeWithLayout(shape)); + EXPECT_THAT(iter, op::ShapeWithLayout(scalar_shape)); + EXPECT_THAT(body_data, op::ShapeWithLayout(shape)); + EXPECT_THAT(body_iter, op::ShapeWithLayout(scalar_shape)); + EXPECT_THAT(cond_iter, op::ShapeWithLayout(scalar_shape)); + Shape shape_in_alternate_mem = ShapeUtil::MakeShapeWithLayout( + F32, {2, 3}, + /*minor_to_major=*/{1, 0}, /*tiles=*/{}, /*element_size_in_bits=*/0, + kAlternateMemorySpace); + EXPECT_THAT(body_data_mul, op::ShapeWithLayout(shape_in_alternate_mem)); +} + +} // namespace +} // namespace xla From c53d687df1ffe80ebdd064777b94269942f0fcaa Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 19 Jul 2019 11:39:14 -0700 Subject: [PATCH 0169/3053] Add verifier to the MLIR round-trip This ensures that the MLIR module is well-formed. Also dump() the MLIR module on failures only. PiperOrigin-RevId: 259007871 --- tensorflow/compiler/mlir/tensorflow/BUILD | 3 ++- .../tensorflow/translate/mlir_roundtrip_pass.cc | 17 ++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index abe8df63b20..9715a672660 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -1,5 +1,5 @@ load("@local_config_mlir//:tblgen.bzl", "gentbl") -load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test", "tf_native_cc_binary") +load("//tensorflow:tensorflow.bzl", "tf_cc_test", "tf_native_cc_binary") package( default_visibility = [":friends"], @@ -281,6 +281,7 @@ cc_library( "//tensorflow/core:core_cpu_lib", "//tensorflow/core:lib", "//tensorflow/core:protos_all_proto_cc", + "@local_config_mlir//:Analysis", "@local_config_mlir//:IR", "@local_config_mlir//:StandardOps", ], diff --git a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.cc b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.cc index 3d71910edcd..231a73414ba 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.cc @@ -15,11 +15,12 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_pass.h" +#include "mlir/Analysis/Verifier.h" // TF:local_config_mlir +#include "mlir/IR/MLIRContext.h" // TF:local_config_mlir +#include "mlir/IR/Module.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h" #include "tensorflow/compiler/mlir/tensorflow/translate/import_graphdef.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" -#include "mlir/IR/MLIRContext.h" // TF:local_config_mlir -#include "mlir/IR/Module.h" // TF:local_config_mlir #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/protobuf/graph_debug_info.pb.h" @@ -35,9 +36,15 @@ Status MlirRoundtripPass::Run(const GraphOptimizationPassOptions& options) { TF_ASSIGN_OR_RETURN(auto module, ConvertGraphToMlir(**options.graph, debug_info, *options.flib_def, specs, &context)); - // TODO(jpienaar): Remove, just simple verification that this works. - module->dump(); - return ConvertMlirToGraph(*module, confs, options.graph, options.flib_def); + if (failed(mlir::verify(*module))) { + // TODO(jpienaar): Remove, just simple verification that this works. + module->dump(); + return errors::Internal("Verifier failed on MLIR import for the graph"); + } + auto status = + ConvertMlirToGraph(*module, confs, options.graph, options.flib_def); + if (!status.ok()) module->dump(); + return status; } REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 0, From 83f4a0c638988a040824081d0224964e1684214a Mon Sep 17 00:00:00 2001 From: Trent Lo Date: Fri, 19 Jul 2019 12:05:24 -0700 Subject: [PATCH 0170/3053] Add a guard flag for the new garbage collection feature. --- .../core/common_runtime/bfc_allocator.cc | 25 +++++++++++++------ .../core/common_runtime/bfc_allocator.h | 7 +++++- .../common_runtime/gpu/gpu_bfc_allocator.cc | 24 +++++++++++++++++- .../common_runtime/gpu/gpu_bfc_allocator.h | 1 + 4 files changed, 47 insertions(+), 10 deletions(-) diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc index 1de9cc0b7c5..0d4dbb3cee4 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.cc +++ b/tensorflow/core/common_runtime/bfc_allocator.cc @@ -30,8 +30,10 @@ limitations under the License. namespace tensorflow { BFCAllocator::BFCAllocator(SubAllocator* sub_allocator, size_t total_memory, - bool allow_growth, const string& name) + bool allow_growth, const string& name, + bool garbage_collection) : sub_allocator_(sub_allocator), + garbage_collection_(garbage_collection), name_(name), free_chunks_list_(kInvalidChunkHandle), next_allocation_id_(1) { @@ -261,6 +263,11 @@ size_t BFCAllocator::RoundedBytes(size_t bytes) { } bool BFCAllocator::DeallocateFreeRegions(size_t rounded_bytes) { + // Do nothing if garbage collection is off. + if (!garbage_collection_) { + return false; + } + // Searching for free regions. absl::flat_hash_set free_region_ptrs; size_t total_free_bytes = 0; @@ -294,13 +301,15 @@ bool BFCAllocator::DeallocateFreeRegions(size_t rounded_bytes) { return false; } - LOG(WARNING) << "Re-allocate memory regions (i.e., allocations) to avoid OOM" - << " due to memory fragmentation. If you see this message" - << " frequently, you are running near the threshold of the" - << " available device memory and re-allocation can incur great" - << " performance overhead. You may try smaller batch sizes to" - << " observe the performance impact. 
Alternatively you may try" - << " setting `allow_growth=false` in GPUOptions."; + LOG(WARNING) << "Garbage collection: deallocate free memory regions" + << " (i.e., allocations) so that we can re-allocate a larger" + << " region to avoid OOM due to memory fragmentation. If you" + << " see this message frequently, you are running near the" + << " threshold of the available device memory and re-allocation" + << " may incur great performance overhead. You may try smaller" + << " batch sizes to observe the performance impact." + << " Set TF_ENABLE_GPU_GARBAGE_COLLECTION=false if you'd like to" + << " disable this feature."; // Deallocate free regions. DeallocateRegions(free_region_ptrs); diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h index 606527476ce..f3d922f342b 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.h +++ b/tensorflow/core/common_runtime/bfc_allocator.h @@ -48,7 +48,8 @@ class BFCAllocator : public Allocator { public: // Takes ownership of sub_allocator. BFCAllocator(SubAllocator* sub_allocator, size_t total_memory, - bool allow_growth, const string& name); + bool allow_growth, const string& name, + bool garbage_collection = false); ~BFCAllocator() override; string Name() override { return name_; } @@ -486,6 +487,10 @@ class BFCAllocator : public Allocator { // of the available memory. bool started_backpedal_ = false; + // Whether the allocator will deallocate free regions to avoid OOM due to + // memory fragmentation. + bool garbage_collection_; + std::unique_ptr sub_allocator_; string name_; SharedCounter* timing_counter_ = nullptr; diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc index c284958ee9f..aeb5d33f3ca 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc +++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc @@ -52,6 +52,27 @@ bool GPUBFCAllocator::GetAllowGrowthValue(const GPUOptions& gpu_options) { return gpu_options.allow_growth(); } +bool GPUBFCAllocator::GetGarbageCollectionValue() { + const char* enable_gpu_garbage_collection = + std::getenv("TF_ENABLE_GPU_GARBAGE_COLLECTION"); + if (enable_gpu_garbage_collection == nullptr) { + // By default, turn on the memory garbage collection. + return true; + } + if (strcmp("false", enable_gpu_garbage_collection) == 0) { + return false; + } else if (strcmp("true", enable_gpu_garbage_collection) == 0) { + return true; + } + + LOG(ERROR) + << "The TF_ENABLE_GPU_GARBAGE_COLLECTION environment variable is set but" + << " could not be parsed: \"" << enable_gpu_garbage_collection << "\"." + << " Valid values are \"true\" or \"false\"." 
+ << " Using the default value \"true\"."; + return true; +} + GPUBFCAllocator::GPUBFCAllocator(GPUMemAllocator* sub_allocator, size_t total_memory, const string& name) : GPUBFCAllocator(sub_allocator, total_memory, GPUOptions(), name) {} @@ -61,6 +82,7 @@ GPUBFCAllocator::GPUBFCAllocator(GPUMemAllocator* sub_allocator, const GPUOptions& gpu_options, const string& name) : BFCAllocator(sub_allocator, total_memory, - GPUBFCAllocator::GetAllowGrowthValue(gpu_options), name) {} + GPUBFCAllocator::GetAllowGrowthValue(gpu_options), name, + GPUBFCAllocator::GetGarbageCollectionValue()) {} } // namespace tensorflow diff --git a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h index 5cae743115f..0f65abd6e9f 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h +++ b/tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h @@ -43,6 +43,7 @@ class GPUBFCAllocator : public BFCAllocator { private: static bool GetAllowGrowthValue(const GPUOptions& gpu_options); + static bool GetGarbageCollectionValue(); }; } // namespace tensorflow From e896b5c85f998043086b69e94ba823af498814a9 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Fri, 19 Jul 2019 11:46:03 -0700 Subject: [PATCH 0171/3053] Prefer to match BackwardInput convolution. Sometimes a convolution can be both matched as a backward input convolution and a backward filter convolution. If we match it as backward input convolution, we can replace the reverse operation also with the cudnn call. PiperOrigin-RevId: 259009067 --- .../xla/service/gpu/cudnn_conv_rewriter.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc index e81850db69e..a900fc462bb 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc @@ -504,6 +504,13 @@ StatusOr RunOnInstruction(HloInstruction* conv) { ConvolutionDimensionNumbers dnums; HloInstruction* rhs; + std::tie(match, window, dnums, rhs) = MatchBackwardInput(conv); + if (match) { + return CreateCudnnConv(kCudnnConvBackwardInputCallTarget, conv->shape(), + conv->mutable_operand(0), rhs, window, dnums, + conv->feature_group_count(), conv->metadata()); + } + std::tie(match, window, dnums) = MatchBackwardFilter(conv); if (match) { return CreateCudnnConv(kCudnnConvBackwardFilterCallTarget, conv->shape(), @@ -512,13 +519,6 @@ StatusOr RunOnInstruction(HloInstruction* conv) { conv->metadata()); } - std::tie(match, window, dnums, rhs) = MatchBackwardInput(conv); - if (match) { - return CreateCudnnConv(kCudnnConvBackwardInputCallTarget, conv->shape(), - conv->mutable_operand(0), rhs, window, dnums, - conv->feature_group_count(), conv->metadata()); - } - // If all else fails, try a forward convolution. if (CanImplementAsCudnnForwardConv(conv)) { return CreateCudnnConv(kCudnnConvForwardCallTarget, conv->shape(), From dca4035863d923d16df19ac935c38c63d16f6406 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Fri, 19 Jul 2019 11:48:25 -0700 Subject: [PATCH 0172/3053] Automated rollback of commit c1544732dd66a20eafe1add9737da07081c1e03d PiperOrigin-RevId: 259009498 --- tensorflow/lite/delegates/gpu/common/operations.cc | 3 --- tensorflow/lite/delegates/gpu/common/operations.h | 7 ------- tensorflow/lite/delegates/gpu/metal/api.cc | 1 - 3 files changed, 11 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/common/operations.cc b/tensorflow/lite/delegates/gpu/common/operations.cc index 8a8e80e3f12..eb1f01804df 100644 --- a/tensorflow/lite/delegates/gpu/common/operations.cc +++ b/tensorflow/lite/delegates/gpu/common/operations.cc @@ -110,8 +110,6 @@ std::string ToString(enum OperationType op) { return "soft_max"; case OperationType::SPACE_TO_BATCH: return "space_to_batch"; - case OperationType::STRETCH_TIME: - return "stretch_time"; case OperationType::SQRT: return "sqrt"; case OperationType::SQUARE: @@ -161,7 +159,6 @@ OperationType OperationTypeFromString(const std::string& name) { {"sin", OperationType::SIN}, {"slice", OperationType::SLICE}, {"soft_max", OperationType::SOFT_MAX}, - {"stretch_time", OperationType::STRETCH_TIME}, {"sqrt", OperationType::SQRT}, {"square", OperationType::SQUARE}, {"subtract", OperationType::SUB}, diff --git a/tensorflow/lite/delegates/gpu/common/operations.h b/tensorflow/lite/delegates/gpu/common/operations.h index 3e2b36ed8f4..5e564f6763c 100644 --- a/tensorflow/lite/delegates/gpu/common/operations.h +++ b/tensorflow/lite/delegates/gpu/common/operations.h @@ -65,7 +65,6 @@ enum class OperationType { SLICE, SOFT_MAX, SPACE_TO_BATCH, - STRETCH_TIME, SQRT, SQUARE, SQUARED_DIFF, @@ -133,12 +132,6 @@ struct MaxUnpooling2DAttributes { Padding2D padding; }; -struct StretchTimeAttributes { - Axis axis; - int32_t factor; - HW slice; -}; - struct ConcatAttributes { // Defines axis by which to concat on. 
Axis axis = Axis::UNKNOWN; diff --git a/tensorflow/lite/delegates/gpu/metal/api.cc b/tensorflow/lite/delegates/gpu/metal/api.cc index 03e9efa8075..ae0b8c485ea 100644 --- a/tensorflow/lite/delegates/gpu/metal/api.cc +++ b/tensorflow/lite/delegates/gpu/metal/api.cc @@ -263,7 +263,6 @@ Status Compile(const GraphFloat32& graph, const RuntimeOptions& options, case OperationType::MUL: case OperationType::RESIZE: case OperationType::SPACE_TO_BATCH: - case OperationType::STRETCH_TIME: case OperationType::UNKNOWN: return UnimplementedError("Unsupported op: " + node->operation.type); } From 2c7437a5be1c7a28c45a7691eebef3423425a85f Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Fri, 19 Jul 2019 12:27:03 -0700 Subject: [PATCH 0173/3053] Fix the serialization test for CacheDatasetOp --- .../cache_dataset_serialization_test.py | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/cache_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/cache_dataset_serialization_test.py index 2bcf77f5d8a..0f86e44e281 100644 --- a/tensorflow/python/data/experimental/kernel_tests/serialization/cache_dataset_serialization_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/serialization/cache_dataset_serialization_test.py @@ -85,24 +85,14 @@ class CacheDatasetSerializationTest( ds_fn, [5], 8, verify_exhausted=False, save_checkpoint_at_end=False) self.assertSequenceEqual(outputs, range(8)) - if is_memory: - outputs = outputs[:5] - outputs.extend( - self.gen_outputs( - ds_fn, [], - self.num_outputs - 5, - ckpt_saved=True, - verify_exhausted=False)) - self.assertSequenceEqual(outputs, self.expected_outputs()) - else: - # Restoring from checkpoint and running GetNext should return - # `AlreadExistsError` now because the lockfile already exists. - with self.assertRaises(errors.AlreadyExistsError): + outputs = outputs[:5] + outputs.extend( self.gen_outputs( ds_fn, [], self.num_outputs - 5, ckpt_saved=True, - verify_exhausted=False) + verify_exhausted=False)) + self.assertSequenceEqual(outputs, self.expected_outputs()) @parameterized.named_parameters( ('Memory', True), From 86ba7c71113817cc8ae4da905da10997f07ab914 Mon Sep 17 00:00:00 2001 From: Juhyun Lee Date: Fri, 19 Jul 2019 12:06:04 -0700 Subject: [PATCH 0174/3053] TFLite GPU: Clean up model_builder a bit. Specifically, - Rename from CheckActivationSupported to IsSupported. - Replace switch default to specific cases. - Rename template variable from ParamsType to ParamsT. 
PiperOrigin-RevId: 259012929 --- .../delegates/gpu/common/model_builder.cc | 60 ++++++++++--------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.cc b/tensorflow/lite/delegates/gpu/common/model_builder.cc index 986cbe5d5b7..c8c8f8e2657 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder.cc +++ b/tensorflow/lite/delegates/gpu/common/model_builder.cc @@ -399,19 +399,21 @@ class TFLiteOperationParser { const TfLiteRegistration* registration) = 0; }; -Status CheckActivationSupported(TfLiteFusedActivation fused_activation) { - if (fused_activation == kTfLiteActNone) { - return OkStatus(); - } +Status IsActivationSupported(TfLiteFusedActivation fused_activation) { switch (fused_activation) { + case kTfLiteActNone: case kTfLiteActRelu: case kTfLiteActRelu1: case kTfLiteActRelu6: case kTfLiteActTanh: return OkStatus(); - default: - return NotFoundError(absl::StrFormat("Unsupported fused activation: %d.", - fused_activation)); + case kTfLiteActSignBit: + return UnimplementedError("TfLiteFusedActivation.kTfLiteActSignBit"); + case kTfLiteActSigmoid: + return UnimplementedError("TfLiteFusedActivation.kTfLiteActSigmoid"); + + // Do not add default; we want compilation error rather than run-time + // error. } } @@ -497,15 +499,15 @@ Status GetFullyConnectedAttributes(int weights_tensor_id, int bias_tensor_id, return OkStatus(); } -template +template Status RetrieveBuiltinData(const TfLiteNode* tflite_node, - ParamsType** tf_options) { + ParamsT** tf_options) { const auto* params = - reinterpret_cast(tflite_node->builtin_data); + reinterpret_cast(tflite_node->builtin_data); if (!params) { return InternalError("Unable to retrieve builtin_data."); } - *tf_options = const_cast(params); + *tf_options = const_cast(params); return OkStatus(); } @@ -599,8 +601,7 @@ class Conv2DOperationParser : public TFLiteOperationParser { RETURN_IF_ERROR(CheckStridesAndDilation( tf_options->stride_height, tf_options->stride_width, tf_options->dilation_height_factor, tf_options->dilation_width_factor)); - RETURN_IF_ERROR(CheckActivationSupported(tf_options->activation)); - return OkStatus(); + return IsActivationSupported(tf_options->activation); } Status Parse(const TfLiteNode* tflite_node, @@ -784,8 +785,7 @@ class DepthwiseConvolutionOperationParser : public TFLiteOperationParser { RETURN_IF_ERROR(CheckStridesAndDilation( tf_options->stride_height, tf_options->stride_width, tf_options->dilation_height_factor, tf_options->dilation_width_factor)); - RETURN_IF_ERROR(CheckActivationSupported(tf_options->activation)); - return OkStatus(); + return IsActivationSupported(tf_options->activation); } Status Parse(const TfLiteNode* tflite_node, @@ -892,8 +892,7 @@ class Pooling2DOperationParser : public TFLiteOperationParser { RETURN_IF_ERROR(CheckKernelsAndStrides( tf_options->filter_height, tf_options->filter_width, tf_options->stride_height, tf_options->stride_width)); - RETURN_IF_ERROR(CheckActivationSupported(tf_options->activation)); - return OkStatus(); + return IsActivationSupported(tf_options->activation); } public: @@ -1307,22 +1306,25 @@ class ElementwiseOperationParser : public TFLiteOperationParser { public: explicit ElementwiseOperationParser(OperationType operation_type) : operation_type_(operation_type) {} + Status IsSupported(const TfLiteContext* context, const TfLiteNode* tflite_node, const TfLiteRegistration* registration) final { RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 1)); - if (IsTwoArgumentOperation()) { + 
TfLiteSubParams* tf_options; + RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); + if (IsOneArgumentOperation()) { + RETURN_IF_ERROR(CheckInputsOutputs(context, tflite_node, /*inputs=*/1, + /*outputs=*/1)); + } else if (IsTwoArgumentOperation()) { RETURN_IF_ERROR(CheckInputsOutputs(context, tflite_node, /*inputs=*/2, /*outputs=*/1)); - TfLiteSubParams* tf_options = nullptr; - RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); - RETURN_IF_ERROR(CheckActivationSupported(tf_options->activation)); - } else if (!IsOneArgumentOperation()) { - return InvalidArgumentError("Incorrect operation type passed"); + } else { + return InvalidArgumentError("Op can only handle 1 or 2 operand(s)."); } - - return OkStatus(); + return IsActivationSupported(tf_options->activation); } + Status Parse(const TfLiteNode* tflite_node, const TfLiteRegistration* registration, GraphFloat32* graph, ObjectReader* reader) final { @@ -1376,13 +1378,13 @@ class ElementwiseOperationParser : public TFLiteOperationParser { bool IsOneArgumentOperation() const { switch (operation_type_) { case OperationType::ABS: - case OperationType::SIN: case OperationType::COS: case OperationType::LOG: - case OperationType::SQRT: case OperationType::RSQRT: - case OperationType::SQUARE: case OperationType::SIGMOID: + case OperationType::SIN: + case OperationType::SQRT: + case OperationType::SQUARE: case OperationType::TANH: return true; default: @@ -1392,10 +1394,10 @@ class ElementwiseOperationParser : public TFLiteOperationParser { bool IsTwoArgumentOperation() const { switch (operation_type_) { - case OperationType::SUB: case OperationType::DIV: case OperationType::POW: case OperationType::SQUARED_DIFF: + case OperationType::SUB: return true; default: return false; From be42a1eb1202792cbbb18941384cfacc7ef5dd67 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 19 Jul 2019 12:08:38 -0700 Subject: [PATCH 0175/3053] Fix internal attribute name used by grappler arithmetic optimizer (NFC) Attributes are supposed to match the regex "[a-z][a-z0-9_]+" according to the documentation in op_def.proto PiperOrigin-RevId: 259013399 --- tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc index 273460050fc..ebf704c0718 100644 --- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc @@ -62,9 +62,9 @@ namespace { // Mark nodes created or optimized by a stage with a tag. constexpr char kAddOpsRewriteTag[] = - "_grappler:ArithmeticOptimizer:AddOpsRewriteStage"; + "_grappler_ArithmeticOptimizer_AddOpsRewriteStage"; constexpr char kMinimizeBroadcastsTag[] = - "_grappler:ArithmeticOptimizer:MinimizeBroadcasts"; + "_grappler_ArithmeticOptimizer_MinimizeBroadcasts"; // Extract values from a Const op to `values`. Returns true if succeeds. template From 84d5ed5ba64d1e3176f4db6682bb912ce797ffa1 Mon Sep 17 00:00:00 2001 From: Bixia Zheng Date: Fri, 19 Jul 2019 12:12:05 -0700 Subject: [PATCH 0176/3053] [XLA] Add support for exhaustive test of operations with more than 32 bit input. For operations that require 64 bits or more input data, we can't actually exhaustively test all input bit patterns. Instead, we define a data structure, FpValues, for a test to specify a subset of bit patterns being test. Add exhaustive tests for transcendental operations of F64, C64 and C128. 
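For readers outside the XLA test framework, the sampling idea behind FpValues/BitChunks can be sketched in a few lines of plain NumPy. This is an illustration only, not code from this patch: the helper name sampled_doubles and the bit-pattern bounds below are arbitrary choices made for the sketch.

    import math
    import numpy as np

    def sampled_doubles(start_bits, end_bits, spacing):
      # Every `spacing`-th 64-bit pattern in [start_bits, end_bits], reinterpreted
      # as float64 -- the same idea BitChunks/FpValues use when the input space is
      # far too large to enumerate exhaustively.
      bits = np.arange(start_bits, end_bits + 1, spacing, dtype=np.uint64)
      return bits.view(np.float64)

    # Sweep part of the positive normal range [1.0, 2**53] and sanity-check log
    # against the Python reference on a few of the sampled values.
    xs = sampled_doubles(0x3FF0000000000000, 0x4340000000000000, 1 << 40)
    for x in xs[:16]:
      assert math.isclose(np.log(x), math.log(float(x)), rel_tol=1e-12)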
PiperOrigin-RevId: 259014020 --- .../xla/tests/exhaustive_op_test_utils.h | 413 +++++++++++++++++- .../xla/tests/exhaustive_unary_test.cc | 341 ++++++++++++++- 2 files changed, 748 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/xla/tests/exhaustive_op_test_utils.h b/tensorflow/compiler/xla/tests/exhaustive_op_test_utils.h index 3df4de295e3..956e1694fb7 100644 --- a/tensorflow/compiler/xla/tests/exhaustive_op_test_utils.h +++ b/tensorflow/compiler/xla/tests/exhaustive_op_test_utils.h @@ -45,7 +45,13 @@ class ExhaustiveOpTestBase : public ClientLibraryTestBase { // `ty` is the primitive type being tested. explicit ExhaustiveOpTestBase(PrimitiveType ty) - : ty_(ty), platform_(client_->platform()->Name()) {} + : ty_(ty), platform_(client_->platform()->Name()) { + SetFastMathDisabled(true); + + // Run all HLO passes. In particular, constant folding is disabled by + // default for tests, but we need to run it in order to tickle some bugs. + mutable_debug_options()->clear_xla_disable_hlo_passes(); + } // Builds and runs the computation using the LocalClient API, rather than the // plain Client API, which is used by ClientLibraryTestBase. This is because @@ -227,5 +233,410 @@ class ExhaustiveOpTestBase : public ClientLibraryTestBase { bool relaxed_denormal_signs_ = platform_ != "CUDA"; }; +// Represents a set of 64 bit chunks by representing the starting bit chunk, +// the last bit chunk, and the spacing between two adjacent bit chunks, without +// actually storing all the bit chunks being generated. The bit chunk iterator +// is provided to retrieve all the bit chunks. +// +// This data structure is used to generate the bit representation to test +// operations that requires more than 64 bit input data. In this case, +// truly exhaustive testing is not possible and we want to test a value every +// n values, where n == spacing_. +// +// Currently, the iterator of BitChunks adds the `spacing_` to a bit chunk to +// compute the next bit chunk. We can change this to use values generated +// by a random number generator that can achieve the average spacing +// statistically, if we will find this is necessary. +class BitChunks { + public: + class iterator + : public std::iterator { + public: + iterator() {} + + explicit iterator(const BitChunks* bit_chunks) + : bit_chunks_(bit_chunks), next_bit_chunk_(bit_chunks->start_) {} + + iterator& operator++() { + Next(); + return *this; + } + + iterator operator++(int) { + iterator retval = *this; + Next(); + return retval; + } + + bool operator==(iterator other) const { + return bit_chunks_ == other.bit_chunks_ && + next_bit_chunk_ == other.next_bit_chunk_; + } + + bool operator!=(iterator other) const { return !(*this == other); } + + iterator MoveToEnd() { + MoveNextBitChunkToOnePassEnd(); + return *this; + } + + reference operator*() const { + CHECK(*this != this->bit_chunks_->end()); + return next_bit_chunk_; + } + + const BitChunks* GetBitChunks() const { return bit_chunks_; } + + void Reset() { next_bit_chunk_ = bit_chunks_->start_; } + + void Next() { + CHECK(*this != this->bit_chunks_->end()); + if (next_bit_chunk_ == bit_chunks_->end_) { + MoveNextBitChunkToOnePassEnd(); + } else { + next_bit_chunk_ += bit_chunks_->spacing_; + if (next_bit_chunk_ > bit_chunks_->end_) { + next_bit_chunk_ = bit_chunks_->end_; + } + } + } + + std::string ToString() const { + return absl::StrFormat("0x%08x", next_bit_chunk_); + } + + private: + // Move next_bit_chunk_ to 1 pass the bit_chunks_->end, to mark that the + // iterator has reached the end. 
When spacing_ is not one, or if we will + // change to use a random value instead of spacing_ in function Next(), + // normalizing the representation of the iterator ending this way can + // can simplify the checking for iterator ending. + void MoveNextBitChunkToOnePassEnd() { + next_bit_chunk_ = bit_chunks_->end_ + 1; + } + + const BitChunks* bit_chunks_; + uint64 next_bit_chunk_; + }; + + iterator begin() const { return iterator(this); } + iterator end() const { + iterator end(this); + return end.MoveToEnd(); + } + + explicit BitChunks(uint64 start = 0, uint64 end = 0, uint64 spacing = 1) + : start_(start), end_(end), spacing_(spacing) { + CHECK_GE(end_, start_); + CHECK_NE(spacing, 0) << ToString(); + } + + int64 GetTotalBitChunks() const { + if (start_ == end_) { + return 1; + } + + return 1 + (end_ - start_ + spacing_ - 1) / spacing_; + } + + std::string ToString() const { + return absl::StrFormat("(0x%08x, 0x%08x, 0x%08x)", start_, end_, spacing_); + } + + uint64 start_; + uint64 end_; + uint64 spacing_; +}; + +inline string StringifyNum(BitChunks c) { return c.ToString(); } + +inline string StringifyNum(BitChunks::iterator c) { return c.ToString(); } + +template +void AppendStringifyNum(std::string* s, T x) { + absl::StrAppend(s, StringifyNum(x)); +} + +// Represents a set of floating point values through the possible values for +// the three components: mantissa, exponent, and sign. Also implements an +// iterator for retrieving all the represented floating point values. +class FpValues { + public: + static constexpr uint kTotalBitChunks = 3; + + class iterator + : public std::iterator { + public: + explicit iterator(const FpValues* fp_values) : fp_values_(fp_values) { + for (int i = 0; i < FpValues::kTotalBitChunks; ++i) { + iters_[i] = BitChunks::iterator(&fp_values->GetBitChunks(i)); + } + } + + iterator& operator++() { + Next(); + return *this; + } + + iterator operator++(int) { + iterator retval = *this; + Next(); + return retval; + } + + bool operator==(iterator other) const { + for (int i = 0; i < FpValues::kTotalBitChunks; ++i) { + if (iters_[i] != other.GetBitChunksIter(i)) { + return false; + } + } + return true; + } + + bool operator!=(iterator other) const { return !(*this == other); } + + iterator MoveToEnd() { + for (int i = 0; i < FpValues::kTotalBitChunks; ++i) { + iters_[i].MoveToEnd(); + } + return *this; + } + + uint64 operator*() const { + uint64 value = 0; + for (int i = 0; i < FpValues::kTotalBitChunks; ++i) { + value = value | (*iters_[i]) << fp_values_->offsets_[i]; + } + return value; + } + + const BitChunks::iterator& GetBitChunksIter(int i) { return iters_[i]; } + + std::string ToString() const { + return absl::StrJoin(iters_, ",", + AppendStringifyNum); + } + + private: + // Moves the iterator for the ith BitChunks to the next value, and + // returns true if the new state is not the end of the iterator. 
+ bool Next(int i = 0) { + iters_[i].Next(); + if (iters_[i] == iters_[i].GetBitChunks()->end()) { + if (i == FpValues::kTotalBitChunks - 1) { + return false; + } + if (Next(i + 1)) { + iters_[i].Reset(); + return true; + } + return false; + } + return true; + } + + std::array iters_; + const FpValues* fp_values_; + }; + + FpValues(absl::Span chunks, absl::Span offsets) { + CHECK_EQ(chunks.size(), offsets.size() - 1); + CHECK_EQ(chunks.size(), kTotalBitChunks); + std::copy_n(chunks.begin(), kTotalBitChunks, bit_chunks_.begin()); + std::copy_n(offsets.begin(), kTotalBitChunks, offsets_.begin()); + + // The last value in `offsets` is the total number of bits. + offsets_[kTotalBitChunks] = offsets[kTotalBitChunks]; + // Validate the input values. + for (int i = 0; i < kTotalBitChunks; ++i) { + int total_bits = offsets[i + 1] - offsets[i]; + if (total_bits < 64) { + uint64 bound = 1ull << total_bits; + CHECK_LT(chunks[i].start_, bound); + CHECK_LT(chunks[i].end_, bound); + } else { + CHECK_EQ(total_bits, 64); + } + } + } + + iterator begin() const { return iterator(this); } + + iterator end() const { + iterator end(this); + return end.MoveToEnd(); + } + + int64 GetTotalNumValues() const { + int64 total = 1; + absl::c_for_each(bit_chunks_, [&](const BitChunks& chunks) { + total *= chunks.GetTotalBitChunks(); + }); + return total; + } + + const BitChunks& GetBitChunks(int i) const { return bit_chunks_[i]; } + + std::string ToString() const { + return absl::StrCat( + "[", absl::StrJoin(bit_chunks_, ",", AppendStringifyNum), + "]"); + } + + std::array bit_chunks_; + std::array offsets_; +}; + +template +int GetMantissaTotalBits() { + static_assert(std::is_same::value || std::is_same::value, + "Only supports float and double."); + return std::numeric_limits::digits - 1; +} + +template +int GetFpTotalBits() { + return sizeof(T) * 8; +} + +template +int GetExponentTotalBits() { + return GetFpTotalBits() - GetMantissaTotalBits() - 1; +} + +template +uint64 GetAllOneMantissa() { + return (1ull << GetMantissaTotalBits()) - 1ull; +} + +template +uint64 GetAllOneExponent() { + return (1ull << GetExponentTotalBits()) - 1ull; +} + +template +FpValues GetFpValues(BitChunks mantissa, BitChunks exponent, BitChunks sign) { + static_assert(std::is_same::value || std::is_same::value, + "Only supports float and double."); + int total_bits = GetFpTotalBits(); + return FpValues({mantissa, exponent, sign}, + {0, GetMantissaTotalBits(), total_bits - 1, total_bits}); +} + +template +FpValues GetZeros() { + return GetFpValues(BitChunks(0, 0, 1), BitChunks(0, 0, 1), + BitChunks(0, 1, 1)); +} + +template +FpValues GetSubnormals(int approx_num_values) { + int mantissa = GetMantissaTotalBits(); + uint64 mantissa_spacing = (1ull << mantissa) / (approx_num_values * 2); + return GetFpValues( + BitChunks(0x1, GetAllOneMantissa(), mantissa_spacing), + BitChunks(0, 0, 1), BitChunks(0, 1, 1)); +} + +template +FpValues GetInfinites() { + uint64 all_one_exp = GetAllOneExponent(); + return GetFpValues(BitChunks(0, 0, 1), + BitChunks(all_one_exp, all_one_exp, 1), + BitChunks(0, 1, 1)); +} + +template +FpValues GetNans(int approx_num_values) { + int mantissa = GetMantissaTotalBits(); + uint64 mantissa_spacing = (1ull << mantissa) / (approx_num_values * 2); + uint64 all_one_exp = GetAllOneExponent(); + return GetFpValues( + BitChunks(0x1, GetAllOneMantissa(), mantissa_spacing), + BitChunks(all_one_exp, all_one_exp, 1), BitChunks(0, 1, 1)); +} + +template +FpValues GetNormals(int approx_num_values) { + float component_total = 
std::sqrtf(approx_num_values); + return GetFpValues( + BitChunks(0x1, GetAllOneMantissa(), + (1ull << (GetMantissaTotalBits() + 1)) / component_total), + BitChunks(0x1, GetAllOneExponent() - 1, + (1ull << (GetExponentTotalBits() + 1)) / component_total), + BitChunks(0, 1, 1)); +} + +// Returns a vector of FpValues, which together represent about +// `approx_num_values` floating point values of type `T`, with each FpValues +// represents about `num_values_per_group` floating point values. +template +std::vector GetFpValuesWithExponents(uint64 first_exponent, + uint64 exponent_spacing, + uint64 num_exponents, + uint64 approx_num_values, + uint64 num_values_per_group) { + const uint64 num_signs = 2; + uint64 approx_num_mantissa = approx_num_values / (num_exponents * num_signs); + uint64 num_mantissa_per_group = + num_values_per_group / (num_exponents * num_signs); + CHECK_GT(approx_num_mantissa, 0); + CHECK_GT(num_mantissa_per_group, 0); + + CHECK_LT(first_exponent + num_exponents - 1ull, GetAllOneExponent()); + int mantissa = GetMantissaTotalBits(); + uint64 mantissa_spacing = (1ull << mantissa) / approx_num_mantissa; + + std::vector result; + for (uint64 group_start = 0; group_start < GetAllOneMantissa(); + group_start += mantissa_spacing * num_mantissa_per_group) { + uint64 group_end = + group_start + (num_mantissa_per_group - 1) * mantissa_spacing; + if (group_end > GetAllOneMantissa()) { + group_end = GetAllOneMantissa(); + } + result.push_back(GetFpValues( + BitChunks(group_start, group_end, mantissa_spacing), + BitChunks(first_exponent, first_exponent + num_exponents - 1, 1), + BitChunks(0, 1, 1))); + } + return result; +} + +// Returns a vector of FpValues together represent about `approx_num_values` +// "very large" floating point values and `approx_num_values` "very small" +// floating point values of type `T`, which each FpValues represent about +// `num_values_per_group` floating point values. Because we use FpValues as +// a parameter for parameterized testing, the number of floating values +// represented by each FpValues affects the input size for each sub-test and +// the hence the peak memory usage of the test. +template +std::vector GetFpValuesForMagnitudeExtremeNormals( + uint64 approx_num_values = 40000, uint64 num_values_per_group = 4000) { + std::vector large = + GetFpValuesWithExponents(GetAllOneExponent() - 5, 1, 5, + approx_num_values / 2, num_values_per_group); + std::vector small = GetFpValuesWithExponents( + 1, 1, 5, approx_num_values / 2, num_values_per_group); + large.insert(large.end(), small.begin(), small.end()); + return large; +} + +template +std::vector CreateFpValuesForBoundaryTest() { + return {GetZeros(), GetSubnormals(1000), GetInfinites(), + GetNans(1000)}; +} + } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_TESTS_EXHAUSTIVE_OP_TEST_UTILS_H_ diff --git a/tensorflow/compiler/xla/tests/exhaustive_unary_test.cc b/tensorflow/compiler/xla/tests/exhaustive_unary_test.cc index 0186d7d668d..5f82af95245 100644 --- a/tensorflow/compiler/xla/tests/exhaustive_unary_test.cc +++ b/tensorflow/compiler/xla/tests/exhaustive_unary_test.cc @@ -326,11 +326,6 @@ class Exhaustive32BitOrLessUnaryTest void Run(std::function enqueue_op, F32EvaluateOp evaluate_op, std::function error_spec_gen) { - SetFastMathDisabled(true); - - // Run all HLO passes. In particular, constant folding is disabled by - // default for tests, but we need to run it in order to tickle some bugs. 
- mutable_debug_options()->clear_xla_disable_hlo_passes(); Literal input_literal = CreateInputLiteral(); switch (ty_) { case F32: @@ -708,4 +703,340 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(std::make_pair(0, 1 << 16)))); #endif +// Exhaustive test for unary operations for double. +// +// Test parameter is a tuple containing +// - primitive type under test, +// - FpValues representing a set of double values. +class ExhaustiveF64UnaryTest : public ExhaustiveRealUnaryTestBase, + public ::testing::WithParamInterface< + std::tuple> { + public: + typedef double (*F64EvaluateOp)(double); + + ExhaustiveF64UnaryTest() + : ExhaustiveRealUnaryTestBase(std::get<0>(GetParam())) {} + + void Run(std::function enqueue_op, F64EvaluateOp evaluate_op) { + return Run(enqueue_op, evaluate_op, GetDefaultSpecGenerator(ty_)); + } + + void Run(std::function enqueue_op, F64EvaluateOp evaluate_op, + std::function error_spec_gen) { + CHECK_EQ(ty_, F64); + Literal input_literal = CreateInputLiteral(); + FillInputF64(&input_literal); + RunImpl(enqueue_op, evaluate_op, input_literal, + error_spec_gen); + } + + private: + int64 GetInputSize() override { + FpValues values = std::get<1>(GetParam()); + return values.GetTotalNumValues(); + } + + void FillInputF64(Literal* input_literal) { + FpValues fp_values = std::get<1>(GetParam()); + int64 input_size = input_literal->element_count(); + LOG(INFO) << "Checking fp values " << fp_values.ToString() << ", " + << input_size; + absl::Span input_arr = input_literal->data(); + + uint64 i = 0; + for (auto bits : fp_values) { + input_arr[i] = ConvertAndReplaceKnownIncorrectValueWith(bits, 1); + ++i; + } + CHECK_EQ(i, input_size); + } +}; + +XLA_TEST_P(ExhaustiveF64UnaryTest, Log) { Run(Log, std::log); } + +// TODO(bixia): add other unary ops for double + +#if !defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT64) +INSTANTIATE_TEST_SUITE_P( + SpecialValues, ExhaustiveF64UnaryTest, + ::testing::Combine( + ::testing::Values(F64), + ::testing::ValuesIn(CreateFpValuesForBoundaryTest()))); + +INSTANTIATE_TEST_SUITE_P( + NormalValues, ExhaustiveF64UnaryTest, + ::testing::Combine(::testing::Values(F64), + ::testing::Values(GetNormals(1000)))); + +// Tests a total of 4000000000 inputs, with 16000000 inputs in each sub-test, to +// keep the peak memory usage low. +INSTANTIATE_TEST_SUITE_P( + LargeAndSmallMagnituedNormalValues, ExhaustiveF64UnaryTest, + ::testing::Combine( + ::testing::Values(F64), + ::testing::ValuesIn(GetFpValuesForMagnitudeExtremeNormals( + 4000000000ull, 16000000)))); +#endif + +class ExhaustiveComplexUnaryTestBase : public ExhaustiveOpTestBase { + public: + explicit ExhaustiveComplexUnaryTestBase(PrimitiveType ty) + : ExhaustiveOpTestBase(ty) {} + + // A helper for implementing the Run method for unary op test of complex + // numbers. + // + // T is the component type of the complex number. 
+ template + void Run(std::function enqueue_op, + std::complex (*evaluate_op)(std::complex), + FpValues* values_real, FpValues* values_imag, + std::function error_spec_gen) { + Literal input_literal = CreateInputLiteral(); + + FillInput(&input_literal, values_real, values_imag); + + XlaBuilder builder(TestName()); + auto input = Parameter(&builder, 0, input_literal.shape(), "input"); + enqueue_op(input); + TF_ASSERT_OK_AND_ASSIGN(XlaComputation comp, builder.Build()); + TF_ASSERT_OK_AND_ASSIGN(Literal result_literal, + RunComputation(comp, {&input_literal})); + ExpectNearComplex(input_literal, result_literal, evaluate_op, + error_spec_gen); + } + + // Generates the input complex literal given the FpValues representation for + // the real and imaginary components. + // + // T is the component type of the complex number. + template + void FillInput(Literal* input_literal, FpValues* real_values, + FpValues* imag_values) { + VLOG(2) << " testing input total " + << real_values->GetTotalNumValues() * + imag_values->GetTotalNumValues() + << ", range " << real_values->ToString() << " " + << imag_values->ToString(); + + absl::Span> input_arr = + input_literal->data>(); + + uint64 i = 0; + for (auto real : *real_values) { + for (auto imag : *imag_values) { + input_arr[i] = std::complex( + ConvertAndReplaceKnownIncorrectValueWith(real, 1), + ConvertAndReplaceKnownIncorrectValueWith(imag, 1)); + + ++i; + } + } + } + + template + void ExpectNearComplex(const Literal& input_literal, + const Literal& result_literal, + std::complex (*evaluate_op)(std::complex), + std::function error_spec_gen) { + absl::Span> input_arr = + input_literal.data>(); + absl::Span> result_arr = + result_literal.data>(); + ASSERT_EQ(result_arr.size(), input_arr.size()); + int64 mismatches = 0; + + for (int64 i = 0; i < input_arr.size(); ++i) { + std::complex input = input_arr[i]; + std::complex actual = result_arr[i]; + std::complex expected = evaluate_op(input); + + // TODO(bixia): Need to fix error_spec_gen to consider both components. + // This only affects the value specific error_spec, and before we fix + // this, it means complex operation testing doesn't support value + // specific error_spec yet. We delay the fix to this partially because + // we don't know whether it is enough for the error_spec to only take + // the absolute value of the complex number. + ErrorSpec error_spec = error_spec_gen(input.real()); + + if (IsClose(expected.real(), actual.real(), error_spec) && + IsClose(expected.imag(), actual.imag(), error_spec)) { + continue; + } + + // TODO(bixia): Need to handle complex operands with subnormals in + // real and/or imaginary components. + VLOG(2) << "calculate " << StringifyNum(input) << " ;" + << StringifyNum(actual) << "; " << StringifyNum(expected); + + PrintMismatch(&mismatches, [&] { + return absl::StrFormat("Mismatch on %s. Expected %s, but got %s.", + StringifyNum(input), StringifyNum(expected), + StringifyNum(actual)); + }); + } + + EXPECT_EQ(mismatches, 0); + } +}; + +// Unary op test for complex. +// +// Test parameter is a tuple containing +// - primitive type under test, +// - two FpValues representing the values for the real and imaginary +// components. The complex numbers for the test input is the cartesian +// product of the values represented by the two FpValues. 
+class ExhaustiveC64UnaryTest + : public ExhaustiveComplexUnaryTestBase, + public ::testing::WithParamInterface< + std::tuple> { + public: + typedef complex64 (*C64EvaluateOp)(complex64); + + ExhaustiveC64UnaryTest() + : ExhaustiveComplexUnaryTestBase(std::get<0>(GetParam())) {} + + void Run(std::function enqueue_op, C64EvaluateOp evaluate_op) { + return Run(enqueue_op, evaluate_op, GetDefaultSpecGenerator(ty_)); + } + + void Run(std::function enqueue_op, C64EvaluateOp evaluate_op, + std::function error_spec_gen) { + FpValues values_real = std::get<1>(GetParam()); + FpValues values_imag = std::get<2>(GetParam()); + ExhaustiveComplexUnaryTestBase::Run( + enqueue_op, evaluate_op, &values_real, &values_imag, error_spec_gen); + } + + int64 GetInputSize() override { + FpValues values_real = std::get<1>(GetParam()); + FpValues values_imag = std::get<2>(GetParam()); + return values_real.GetTotalNumValues() * values_imag.GetTotalNumValues(); + } +}; + +INSTANTIATE_TEST_SUITE_P( + F32SpecialValues, ExhaustiveC64UnaryTest, + ::testing::Combine( + ::testing::Values(C64), + ::testing::ValuesIn(CreateFpValuesForBoundaryTest()), + ::testing::ValuesIn(CreateFpValuesForBoundaryTest()))); + +INSTANTIATE_TEST_SUITE_P( + F32SpecialAndNormalValues, ExhaustiveC64UnaryTest, + ::testing::Combine( + ::testing::Values(C64), + ::testing::ValuesIn(CreateFpValuesForBoundaryTest()), + ::testing::Values(GetNormals(10000)))); + +INSTANTIATE_TEST_SUITE_P( + F32NormalAndSpecialValues, ExhaustiveC64UnaryTest, + ::testing::Combine( + ::testing::Values(C64), ::testing::Values(GetNormals(10000)), + ::testing::ValuesIn(CreateFpValuesForBoundaryTest()))); + +INSTANTIATE_TEST_SUITE_P( + F32NormalAndNormalValues, ExhaustiveC64UnaryTest, + ::testing::Combine(::testing::Values(C64), + ::testing::Values(GetNormals(10000)), + ::testing::Values(GetNormals(10000)))); + +// Tests a total of 40000 ^ 2 inputs, with 4000 ^ 2 inputs in each sub-test, to +// keep the peak memory usage low. +INSTANTIATE_TEST_SUITE_P( + F32LargeAndSmallMagnituedNormalValues, ExhaustiveC64UnaryTest, + ::testing::Combine( + ::testing::Values(C64), + ::testing::ValuesIn(GetFpValuesForMagnitudeExtremeNormals(40000, + 4000)), + ::testing::ValuesIn( + GetFpValuesForMagnitudeExtremeNormals(40000, 4000)))); + +// Unary op test for complex. +// +// Test parameter is a tuple containing +// - primitive type under test, +// - two FpValues representing the values for the real and imaginary +// components. The complex numbers for the test input is the cartesian +// product of the values represented by the two FpValues. 
+class ExhaustiveC128UnaryTest + : public ExhaustiveComplexUnaryTestBase, + public ::testing::WithParamInterface< + std::tuple> { + public: + typedef complex128 (*C128EvaluateOp)(complex128); + + ExhaustiveC128UnaryTest() + : ExhaustiveComplexUnaryTestBase(std::get<0>(GetParam())) {} + + void Run(std::function enqueue_op, C128EvaluateOp evaluate_op) { + return Run(enqueue_op, evaluate_op, GetDefaultSpecGenerator(ty_)); + } + + void Run(std::function enqueue_op, C128EvaluateOp evaluate_op, + std::function error_spec_gen) { + FpValues values_real = std::get<1>(GetParam()); + FpValues values_imag = std::get<2>(GetParam()); + ExhaustiveComplexUnaryTestBase::Run( + enqueue_op, evaluate_op, &values_real, &values_imag, error_spec_gen); + } + + int64 GetInputSize() override { + FpValues values_real = std::get<1>(GetParam()); + FpValues values_imag = std::get<2>(GetParam()); + return values_real.GetTotalNumValues() * values_imag.GetTotalNumValues(); + } +}; + +XLA_TEST_P(ExhaustiveC128UnaryTest, Log) { + // TODO(bixia): only test values that are not too big and not too small + // for now and will work on fixing the implementation of XLA + // operations to enable test for other values. + known_incorrect_fn_ = [&](int64 v) { + double f = ConvertValue(v); + return std::fpclassify(f) == FP_NAN || std::abs(f) > 5 || std::abs(f) < 1; + }; + Run(Log, [](complex128 x) { return std::log(x); }); +} + +#if !defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT64) +INSTANTIATE_TEST_SUITE_P( + SpecialValues, ExhaustiveC128UnaryTest, + ::testing::Combine( + ::testing::Values(C128), + ::testing::ValuesIn(CreateFpValuesForBoundaryTest()), + ::testing::ValuesIn(CreateFpValuesForBoundaryTest()))); + +INSTANTIATE_TEST_SUITE_P( + SpecialAndNormalValues, ExhaustiveC128UnaryTest, + ::testing::Combine( + ::testing::Values(C128), + ::testing::ValuesIn(CreateFpValuesForBoundaryTest()), + ::testing::Values(GetNormals(10000)))); + +INSTANTIATE_TEST_SUITE_P( + NormalAndSpecialValues, ExhaustiveC128UnaryTest, + ::testing::Combine( + ::testing::Values(C128), ::testing::Values(GetNormals(10000)), + ::testing::ValuesIn(CreateFpValuesForBoundaryTest()))); + +INSTANTIATE_TEST_SUITE_P( + F32NormalAndNormalValues, ExhaustiveC128UnaryTest, + ::testing::Combine(::testing::Values(C128), + ::testing::Values(GetNormals(10000)), + ::testing::Values(GetNormals(10000)))); + +// Tests a total of 40000 ^ 2 inputs, with 2000 ^ 2 inputs in each sub-test, to +// keep the peak memory usage low. +INSTANTIATE_TEST_SUITE_P( + LargeAndSmallMagnituedNormalValues, ExhaustiveC128UnaryTest, + ::testing::Combine( + ::testing::Values(C128), + ::testing::ValuesIn( + GetFpValuesForMagnitudeExtremeNormals(40000, 2000)), + ::testing::ValuesIn( + GetFpValuesForMagnitudeExtremeNormals(40000, 2000)))); +#endif + } // namespace xla From 0d1de81afe7bc662424c796b13bf8971101e259f Mon Sep 17 00:00:00 2001 From: Edward Loper Date: Fri, 19 Jul 2019 12:31:18 -0700 Subject: [PATCH 0177/3053] Fix bug in reduce_join's handling of arg `keepdims` (in TF 1.x). 
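The root cause is easiest to see with a simplified stand-in for deprecation.deprecated_argument_lookup. The sketch below is illustrative only and is not the TensorFlow implementation: the helper treats "is not None" as "the caller supplied this argument", so a non-None default on the deprecated keep_dims name always shadows an explicit keepdims value. Switching the default to None, as this patch does, lets the lookup fall through to the new argument.

    def deprecated_argument_lookup(new_name, new_value, old_name, old_value):
      # Simplified: prefer the deprecated name only when it was actually passed.
      if old_value is not None:
        if new_value is not None:
          raise ValueError("Cannot specify both %r and %r" % (old_name, new_name))
        return old_value
      return new_value

    def reduce_join_sketch(keep_dims=None, keepdims=None):
      keep_dims = deprecated_argument_lookup("keepdims", keepdims,
                                             "keep_dims", keep_dims)
      # With the old default (keep_dims=False), the lookup returned False here and
      # silently ignored keepdims=True.
      return False if keep_dims is None else keep_dims

    assert reduce_join_sketch(keepdims=True) is True    # honored after the fix
    assert reduce_join_sketch(keep_dims=True) is True   # old name still works
    assert reduce_join_sketch() is False                # default unchanged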
PiperOrigin-RevId: 259017062 --- tensorflow/python/kernel_tests/reduce_join_op_test.py | 10 ++++++++++ tensorflow/python/ops/string_ops.py | 4 +++- tensorflow/tools/api/golden/v1/tensorflow.pbtxt | 2 +- .../tools/api/golden/v1/tensorflow.strings.pbtxt | 2 +- 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/kernel_tests/reduce_join_op_test.py b/tensorflow/python/kernel_tests/reduce_join_op_test.py index 49b6620779e..751e3e3648b 100644 --- a/tensorflow/python/kernel_tests/reduce_join_op_test.py +++ b/tensorflow/python/kernel_tests/reduce_join_op_test.py @@ -351,6 +351,16 @@ class ReduceJoinTest(UnicodeTestCase): with self.assertRaisesOpError("reduction dimension 2"): reduced.eval(feed_dict={placeholder.name: 2}) + def testDeprecatedArgs(self): + foobar = constant_op.constant(["foobar"]) + # Old names: keep_dims and reduction_indices + output = string_ops.reduce_join( + ["foo", "bar"], reduction_indices=0, keep_dims=True) + self.assertAllEqual(foobar, output) + # New names keepdims and axis. + output = string_ops.reduce_join(["foo", "bar"], axis=0, keepdims=True) + self.assertAllEqual(foobar, output) + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py index c27d845db4d..507339b55bb 100644 --- a/tensorflow/python/ops/string_ops.py +++ b/tensorflow/python/ops/string_ops.py @@ -326,13 +326,15 @@ def _reduce_join_reduction_dims(x, axis, reduction_indices): @tf_export(v1=["strings.reduce_join", "reduce_join"]) @deprecation.deprecated_endpoints("reduce_join") def reduce_join(inputs, axis=None, # pylint: disable=missing-docstring - keep_dims=False, + keep_dims=None, separator="", name=None, reduction_indices=None, keepdims=None): keep_dims = deprecation.deprecated_argument_lookup( "keepdims", keepdims, "keep_dims", keep_dims) + if keep_dims is None: + keep_dims = False inputs_t = ops.convert_to_tensor(inputs) reduction_indices = _reduce_join_reduction_dims( inputs_t, axis, reduction_indices) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt index 90dcb1c4934..32f85a0a66b 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt @@ -1874,7 +1874,7 @@ tf_module { } member_method { name: "reduce_join" - argspec: "args=[\'inputs\', \'axis\', \'keep_dims\', \'separator\', \'name\', \'reduction_indices\', \'keepdims\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'inputs\', \'axis\', \'keep_dims\', \'separator\', \'name\', \'reduction_indices\', \'keepdims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'\', \'None\', \'None\', \'None\'], " } member_method { name: "reduce_logsumexp" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt index 54e7ce6b5e3..1a73ab6a7e5 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt @@ -26,7 +26,7 @@ tf_module { } member_method { name: "reduce_join" - argspec: "args=[\'inputs\', \'axis\', \'keep_dims\', \'separator\', \'name\', \'reduction_indices\', \'keepdims\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'inputs\', \'axis\', \'keep_dims\', \'separator\', \'name\', \'reduction_indices\', \'keepdims\'], varargs=None, keywords=None, 
defaults=[\'None\', \'None\', \'\', \'None\', \'None\', \'None\'], " } member_method { name: "regex_full_match" From 1774e14125d220699b15d05d3c0de7c12211de75 Mon Sep 17 00:00:00 2001 From: Juhyun Lee Date: Fri, 19 Jul 2019 12:37:16 -0700 Subject: [PATCH 0178/3053] TFLite GPU: Transpose weights if DepthwiseConv2DOptions.depth_multiplier > 1. PiperOrigin-RevId: 259018065 --- .../delegates/gpu/common/model_builder.cc | 89 +++++++++++++++++-- 1 file changed, 82 insertions(+), 7 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.cc b/tensorflow/lite/delegates/gpu/common/model_builder.cc index c8c8f8e2657..9a89c0df9b9 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder.cc +++ b/tensorflow/lite/delegates/gpu/common/model_builder.cc @@ -355,6 +355,12 @@ class ObjectReader { : nullptr; } + TfLiteTensor* GetOutputTensor(int index) const { + return index >= 0 && index < tflite_node_->outputs->size + ? context_->tensors + tflite_node_->outputs->data[index] + : nullptr; + } + private: GraphFloat32* graph_ = nullptr; const TfLiteContext* context_ = nullptr; @@ -780,12 +786,47 @@ class DepthwiseConvolutionOperationParser : public TFLiteOperationParser { RETURN_IF_ERROR( CheckInputsOutputs(context, tflite_node, /*inputs=*/1, /*outputs=*/1)); RETURN_IF_ERROR(CheckTensorIsAvailable(context, tflite_node, 1)); - TfLiteDepthwiseConvParams* tf_options = nullptr; + TfLiteDepthwiseConvParams* tf_options; RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); RETURN_IF_ERROR(CheckStridesAndDilation( tf_options->stride_height, tf_options->stride_width, tf_options->dilation_height_factor, tf_options->dilation_width_factor)); - return IsActivationSupported(tf_options->activation); + RETURN_IF_ERROR(IsActivationSupported(tf_options->activation)); + + const int depth_multiplier = tf_options->depth_multiplier; + const auto* input = context->tensors + tflite_node->inputs->data[0]; + const auto* filter = context->tensors + tflite_node->inputs->data[1]; + const auto* bias = tflite_node->inputs->size > 2 + ? 
context->tensors + tflite_node->inputs->data[2] + : nullptr; + const auto* output = context->tensors + tflite_node->outputs->data[0]; + if (!input->dims || input->dims->size != 4) { + return InvalidArgumentError("input.dims.size != 4"); + } + if (!filter->dims || filter->dims->size != 4) { + return InvalidArgumentError("filter.dims.size != 4"); + } + if (!output->dims || output->dims->size != 4) { + return InvalidArgumentError("output.dims.size != 4"); + } + if (input->dims->data[0] != output->dims->data[0]) { + return InvalidArgumentError("input.b != output.b"); + } + const int input_depth = input->dims->data[3]; + const int output_depth = output->dims->data[3]; + if (filter->dims->data[3] != output_depth) { + return InvalidArgumentError("filter.i != output.c"); + } + if (output_depth != input_depth * depth_multiplier) { + return InvalidArgumentError("output.c != input.c * depth_multiplier"); + } + if (bias && NumElements(bias) != output_depth) { + return InvalidArgumentError("bias.size != output.c"); + } + if (depth_multiplier != 1 && input_depth != 1) { + return UnimplementedError("depth_multiplier != 1 && input.c != 1"); + } + return OkStatus(); } Status Parse(const TfLiteNode* tflite_node, @@ -799,11 +840,8 @@ class DepthwiseConvolutionOperationParser : public TFLiteOperationParser { DepthwiseConvolution2DAttributes attr; RETURN_IF_ERROR(reader->ReadTensor(1, &attr.weights)); reader->ReadTensor(2, &attr.bias).IgnoreError(); // bias is optional - const auto* tf_options = reinterpret_cast( - tflite_node->builtin_data); - if (!tf_options) { - return InternalError("Missing tflite params"); - } + TfLiteDepthwiseConvParams* tf_options; + RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); attr.strides = ToHW(tf_options->stride_height, tf_options->stride_width); attr.dilations = HW(std::max(1, tf_options->dilation_height_factor), std::max(1, tf_options->dilation_width_factor)); @@ -811,9 +849,46 @@ class DepthwiseConvolutionOperationParser : public TFLiteOperationParser { graph->FindInputs(node->id)[0]->tensor.shape, &attr); RETURN_IF_ERROR(MaybeFuseActivationToTheSingleOutput(tf_options->activation, graph, node)); + const int depth_multiplier = tf_options->depth_multiplier; + if (depth_multiplier != 1) { + const TfLiteTensor* input = reader->GetInputTensor(0); + const TfLiteTensor* filter = reader->GetInputTensor(1); + const TfLiteTensor* output = reader->GetOutputTensor(0); + TransposeWeights(input, filter, output, depth_multiplier, &attr); + } node->operation.attributes = std::move(attr); return OkStatus(); } + + private: + // TFLite CPU stores weights as: + // [1, kernel_height, kernel_width, input_depth * depth_multiplier] + // TFLite GPU stores weights as: + // [depth_multiplier, kernel_height, kernel_width, input_depth] + static void TransposeWeights(const TfLiteTensor* input, + const TfLiteTensor* filter, + const TfLiteTensor* output, int depth_multiplier, + DepthwiseConvolution2DAttributes* attr) { + const int input_depth = input->dims->data[3]; + const int filter_height = filter->dims->data[1]; + const int filter_width = filter->dims->data[2]; + const int output_depth = output->dims->data[3]; + Tensor weights; + weights.id = attr->weights.id; + weights.shape = + OHWI(output_depth, filter_height, filter_width, input_depth); + weights.data.resize(weights.shape.DimensionsProduct()); + float* dst = &weights.data[0]; + for (int j = 0; j < output_depth; ++j) { + const float* src = attr->weights.data.data() + j; + for (int i = 0; i < filter_height * filter_width; ++i) { + *dst 
= *src; + dst++; + src += output_depth; + } + } + attr->weights = std::move(weights); + } }; class HardSwishOperationParser : public TFLiteOperationParser { From b63c4b28025c9c7a70cd94b6673d496c63b33c8e Mon Sep 17 00:00:00 2001 From: Igor Saprykin Date: Fri, 19 Jul 2019 12:51:51 -0700 Subject: [PATCH 0179/3053] Skip the test that triggers the issue # 137776821 in Eager mode rather than when run_distributed=True, because it still fails. PiperOrigin-RevId: 259020743 --- .../python/keras/distribute/distribute_strategy_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/keras/distribute/distribute_strategy_test.py b/tensorflow/python/keras/distribute/distribute_strategy_test.py index b01bcec6bff..f20fa0b1144 100644 --- a/tensorflow/python/keras/distribute/distribute_strategy_test.py +++ b/tensorflow/python/keras/distribute/distribute_strategy_test.py @@ -887,8 +887,8 @@ class TestDistributionStrategyWithDatasets(test.TestCase, combinations.times(strategy_minus_tpu_combinations(), combinations.combine(run_distributed=[True, False]))) def test_on_dataset_with_unknown_cardinality_without_steps( - self, distribution, run_distributed): - if run_distributed: + self, distribution, run_distributed, mode): + if mode == 'eager': self.skipTest('b/137776821 : Fails with -c opt=-undebug') with self.cached_session(): with distribution.scope(): From 98f7f92e25588fb1bca1405531f91d8bde1f0ed4 Mon Sep 17 00:00:00 2001 From: Igor Saprykin Date: Fri, 19 Jul 2019 13:07:41 -0700 Subject: [PATCH 0180/3053] Run dist-strat save model tests with run_distribute=True. It's a good idea to cover these tests because they failed with run_distribute=True in the past. `run_distribute` is a temporary flag for launching the new code path in Keras. 
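In outline, the change adds a run_distributed axis to every test-parameter combination and forwards the flag into the model-building helpers, which pass it on to compile(). The following self-contained sketch, using plain absl parameterization, shows the shape of that plumbing; fake_get_model and the class name are placeholders invented for the sketch, not code from this patch.

    from absl.testing import absltest, parameterized

    def fake_get_model(run_distributed):
      # Stand-in for model_and_input.get_model(run_distributed=...); the real
      # helpers forward the flag to model.compile(..., run_distributed=...).
      return {"run_distributed": run_distributed}

    class SaveLoadSketch(parameterized.TestCase):

      @parameterized.named_parameters(("distributed", True), ("legacy", False))
      def test_flag_is_forwarded(self, run_distributed):
        model = fake_get_model(run_distributed=run_distributed)
        self.assertEqual(model["run_distributed"], run_distributed)

    if __name__ == "__main__":
      absltest.main()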
PiperOrigin-RevId: 259023542 --- tensorflow/python/distribute/BUILD | 1 + .../keras_experimental_saved_model_test.py | 15 +++++++----- .../python/distribute/keras_save_load_test.py | 16 ++++++++----- .../model_collection/simple_models.py | 24 ++++++++++++++++--- .../distribute/saved_model_mixed_api_test.py | 15 +++++++----- .../distribute/saved_model_save_load_test.py | 15 +++++++----- .../distribute/saved_model_test_base.py | 22 ++++++++++------- 7 files changed, 73 insertions(+), 35 deletions(-) diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index 21831fcd891..6a9f63c290d 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -1134,6 +1134,7 @@ distribute_py_test( size = "medium", srcs = ["keras_save_load_test.py"], main = "keras_save_load_test.py", + shard_count = 3, deps = [ ":saved_model_test_base", "//tensorflow/python/keras:saving", diff --git a/tensorflow/python/distribute/keras_experimental_saved_model_test.py b/tensorflow/python/distribute/keras_experimental_saved_model_test.py index 0bfb3419cc2..0a0a57ffe33 100644 --- a/tensorflow/python/distribute/keras_experimental_saved_model_test.py +++ b/tensorflow/python/distribute/keras_experimental_saved_model_test.py @@ -41,17 +41,20 @@ class KerasExperimentalSaveLoadTest(test_base.TestSavedModelBase): @combinations.generate(test_base.simple_models_with_strategies()) def test_save_no_strategy_restore_strategy(self, model_and_input, - distribution): + distribution, run_distributed): self.run_test_save_no_strategy_restore_strategy(model_and_input, - distribution) + distribution, + run_distributed) @combinations.generate( combinations.times(test_base.simple_models_with_strategies(), combinations.combine(save_in_scope=[True, False]))) def test_save_strategy_restore_no_strategy(self, model_and_input, - distribution, save_in_scope): + distribution, save_in_scope, + run_distributed): self.run_test_save_strategy_restore_no_strategy(model_and_input, - distribution, save_in_scope) + distribution, save_in_scope, + run_distributed) @combinations.generate( combinations.times(test_base.simple_models_with_strategy_pairs(), @@ -59,11 +62,11 @@ class KerasExperimentalSaveLoadTest(test_base.TestSavedModelBase): def test_save_strategy_restore_strategy(self, model_and_input, distribution_for_saving, distribution_for_restoring, - save_in_scope): + save_in_scope, run_distributed): self.run_test_save_strategy_restore_strategy(model_and_input, distribution_for_saving, distribution_for_restoring, - save_in_scope) + save_in_scope, run_distributed) if __name__ == '__main__': diff --git a/tensorflow/python/distribute/keras_save_load_test.py b/tensorflow/python/distribute/keras_save_load_test.py index e001ae43814..fcb4941688d 100644 --- a/tensorflow/python/distribute/keras_save_load_test.py +++ b/tensorflow/python/distribute/keras_save_load_test.py @@ -41,20 +41,23 @@ class KerasSaveLoadTest(test_base.TestSavedModelBase): @combinations.generate(test_base.simple_models_with_strategies()) def test_save_no_strategy_restore_strategy(self, model_and_input, - distribution): + distribution, run_distributed): self.run_test_save_no_strategy_restore_strategy(model_and_input, - distribution) + distribution, + run_distributed) @combinations.generate( combinations.times(test_base.simple_models_with_strategies(), combinations.combine(save_in_scope=[True, False]))) def test_save_strategy_restore_no_strategy(self, model_and_input, - distribution, save_in_scope): + distribution, save_in_scope, + 
run_distributed): if save_in_scope: self.skipTest(('b/134703272 - Saving model in tf.distribute.Strategy ', 'scope is not supported.')) self.run_test_save_strategy_restore_no_strategy(model_and_input, - distribution, save_in_scope) + distribution, save_in_scope, + run_distributed) @combinations.generate( combinations.times(test_base.simple_models_with_strategy_pairs(), @@ -62,14 +65,15 @@ class KerasSaveLoadTest(test_base.TestSavedModelBase): def test_save_strategy_restore_strategy(self, model_and_input, distribution_for_saving, distribution_for_restoring, - save_in_scope): + save_in_scope, run_distributed): if save_in_scope: self.skipTest(('b/134703272 - Saving model in tf.distribute.Strategy ', 'scope is not supported.')) self.run_test_save_strategy_restore_strategy(model_and_input, distribution_for_saving, distribution_for_restoring, - save_in_scope) + save_in_scope, run_distributed) + if __name__ == '__main__': test.main() diff --git a/tensorflow/python/distribute/model_collection/simple_models.py b/tensorflow/python/distribute/model_collection/simple_models.py index d3b811bebc8..5dd5fc27c42 100644 --- a/tensorflow/python/distribute/model_collection/simple_models.py +++ b/tensorflow/python/distribute/model_collection/simple_models.py @@ -49,7 +49,13 @@ class SimpleFunctionalModel(model_collection_base.ModelAndInput): model = keras.Model(inputs=x, outputs=y) optimizer = gradient_descent.SGD(learning_rate=0.001) - model.compile(loss='mse', metrics=['mae'], optimizer=optimizer) + run_distributed = kwargs.pop('run_distributed', None) + assert run_distributed is not None + model.compile( + loss='mse', + metrics=['mae'], + optimizer=optimizer, + run_distributed=run_distributed) return model, output_name @@ -71,7 +77,13 @@ class SimpleSequentialModel(model_collection_base.ModelAndInput): 5, dtype=dtypes.float32, name=output_name, input_dim=3) model.add(y) optimizer = gradient_descent.SGD(learning_rate=0.001) - model.compile(loss='mse', metrics=['mae'], optimizer=optimizer) + run_distributed = kwargs.pop('run_distributed', None) + assert run_distributed is not None + model.compile( + loss='mse', + metrics=['mae'], + optimizer=optimizer, + run_distributed=run_distributed) return model, output_name @@ -100,8 +112,14 @@ class SimpleSubclassModel(model_collection_base.ModelAndInput): def get_model(self, **kwargs): model = _SimpleModel() optimizer = gradient_descent.SGD(learning_rate=0.001) + run_distributed = kwargs.pop('run_distributed', None) + assert run_distributed is not None model.compile( - loss='mse', metrics=['mae'], cloning=False, optimizer=optimizer) + loss='mse', + metrics=['mae'], + cloning=False, + optimizer=optimizer, + run_distributed=run_distributed) return model, model.output_name diff --git a/tensorflow/python/distribute/saved_model_mixed_api_test.py b/tensorflow/python/distribute/saved_model_mixed_api_test.py index 7179987b212..834cfbbabeb 100644 --- a/tensorflow/python/distribute/saved_model_mixed_api_test.py +++ b/tensorflow/python/distribute/saved_model_mixed_api_test.py @@ -49,20 +49,23 @@ class SavedModelSaveAndLoadTest(test_base.TestSavedModelBase): @combinations.generate(test_base.simple_models_with_strategies()) def test_save_no_strategy_restore_strategy(self, model_and_input, - distribution): + distribution, run_distributed): self.run_test_save_no_strategy_restore_strategy(model_and_input, - distribution) + distribution, + run_distributed) @combinations.generate( combinations.times(test_base.simple_models_with_strategies(), combinations.combine(save_in_scope=[True, 
False]))) def test_save_strategy_restore_no_strategy(self, model_and_input, - distribution, save_in_scope): + distribution, save_in_scope, + run_distributed): if save_in_scope: self.skipTest(('Saving model within tf.distribute.Strategy scope is not ', 'supported.')) self.run_test_save_strategy_restore_no_strategy(model_and_input, - distribution, save_in_scope) + distribution, save_in_scope, + run_distributed) @combinations.generate( combinations.times(test_base.simple_models_with_strategy_pairs(), @@ -70,14 +73,14 @@ class SavedModelSaveAndLoadTest(test_base.TestSavedModelBase): def test_save_strategy_restore_strategy(self, model_and_input, distribution_for_saving, distribution_for_restoring, - save_in_scope): + save_in_scope, run_distributed): if save_in_scope: self.skipTest(('Saving model within tf.distribute.Strategy scope is not ', 'supported.')) self.run_test_save_strategy_restore_strategy(model_and_input, distribution_for_saving, distribution_for_restoring, - save_in_scope) + save_in_scope, run_distributed) if __name__ == '__main__': diff --git a/tensorflow/python/distribute/saved_model_save_load_test.py b/tensorflow/python/distribute/saved_model_save_load_test.py index 144ffdbbcc6..6c0b2463de4 100644 --- a/tensorflow/python/distribute/saved_model_save_load_test.py +++ b/tensorflow/python/distribute/saved_model_save_load_test.py @@ -41,20 +41,23 @@ class SavedModelSaveAndLoadTest(test_base.TestSavedModelBase): @combinations.generate(test_base.simple_models_with_strategies()) def test_save_no_strategy_restore_strategy(self, model_and_input, - distribution): + distribution, run_distributed): self.run_test_save_no_strategy_restore_strategy(model_and_input, - distribution) + distribution, + run_distributed) @combinations.generate( combinations.times(test_base.simple_models_with_strategies(), combinations.combine(save_in_scope=[True, False]))) def test_save_strategy_restore_no_strategy(self, model_and_input, - distribution, save_in_scope): + distribution, save_in_scope, + run_distributed): if save_in_scope: self.skipTest(('Saving model within tf.distribute.Strategy scope is not ', 'supported.')) self.run_test_save_strategy_restore_no_strategy(model_and_input, - distribution, save_in_scope) + distribution, save_in_scope, + run_distributed) @combinations.generate( combinations.times(test_base.simple_models_with_strategy_pairs(), @@ -62,14 +65,14 @@ class SavedModelSaveAndLoadTest(test_base.TestSavedModelBase): def test_save_strategy_restore_strategy(self, model_and_input, distribution_for_saving, distribution_for_restoring, - save_in_scope): + save_in_scope, run_distributed): if save_in_scope: self.skipTest(('Saving model within tf.distribute.Strategy scope is not ', 'supported.')) self.run_test_save_strategy_restore_strategy(model_and_input, distribution_for_saving, distribution_for_restoring, - save_in_scope) + save_in_scope, run_distributed) if __name__ == '__main__': diff --git a/tensorflow/python/distribute/saved_model_test_base.py b/tensorflow/python/distribute/saved_model_test_base.py index 11f35b76f91..c17c0e3ef49 100644 --- a/tensorflow/python/distribute/saved_model_test_base.py +++ b/tensorflow/python/distribute/saved_model_test_base.py @@ -62,7 +62,8 @@ def simple_models_with_strategies(): return combinations.combine( model_and_input=simple_models, distribution=strategies_minus_tpu, - mode=['eager']) + mode=['eager'], + run_distributed=[True, False]) def simple_models_with_strategy_pairs(): @@ -70,7 +71,8 @@ def simple_models_with_strategy_pairs(): model_and_input=simple_models, 
distribution_for_saving=strategies_minus_tpu, distribution_for_restoring=strategies_minus_tpu, - mode=['eager']) + mode=['eager'], + run_distributed=[True, False]) def load_and_run_with_saved_model_api(distribution, saved_dir, predict_dataset, @@ -149,13 +151,14 @@ class TestSavedModelBase(test.TestCase, parameterized.TestCase): return predict_dataset def run_test_save_no_strategy_restore_strategy(self, model_and_input, - distribution): + distribution, run_distributed): """Save a model without DS, and restore it with DS.""" saved_dir = os.path.join(self.get_temp_dir(), self._root_dir, 'test_save_no_dist_restore_dist') - model, output_name = model_and_input.get_model() + model, output_name = model_and_input.get_model( + run_distributed=run_distributed) x_train, y_train, x_predict = model_and_input.get_data() batch_size = model_and_input.get_batch_size() @@ -175,14 +178,16 @@ class TestSavedModelBase(test.TestCase, parameterized.TestCase): self.assertAllClose(result_before_save, result_after_save, atol=_TOLERANCE) def run_test_save_strategy_restore_no_strategy(self, model_and_input, - distribution, save_in_scope): + distribution, save_in_scope, + run_distributed): """Save a model with DS, and restore it without DS.""" saved_dir = os.path.join(self.get_temp_dir(), self._root_dir, 'test_save_no_dist_restore_dist') with distribution.scope(): - model, output_name = model_and_input.get_model() + model, output_name = model_and_input.get_model( + run_distributed=run_distributed) x_train, y_train, x_predict = model_and_input.get_data() batch_size = model_and_input.get_batch_size() @@ -207,14 +212,15 @@ class TestSavedModelBase(test.TestCase, parameterized.TestCase): def run_test_save_strategy_restore_strategy(self, model_and_input, distribution_for_saving, distribution_for_restoring, - save_in_scope): + save_in_scope, run_distributed): """Save a model with DS, and restore it with potentially different DS.""" saved_dir = os.path.join(self.get_temp_dir(), self._root_dir, 'test_save_dist_restore_dist') with distribution_for_saving.scope(): - model, output_name = model_and_input.get_model() + model, output_name = model_and_input.get_model( + run_distributed=run_distributed) x_train, y_train, x_predict = model_and_input.get_data() batch_size = model_and_input.get_batch_size() From f811d8a2f7c24e70a7fb73475b639839abe136b4 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Fri, 19 Jul 2019 13:13:30 -0700 Subject: [PATCH 0181/3053] Refactor AssertNextDatasetOp --- .../core/kernels/data/experimental/BUILD | 21 ++ .../experimental/assert_next_dataset_op.cc | 223 +++++++++--------- .../experimental/assert_next_dataset_op.h | 49 ++++ 3 files changed, 180 insertions(+), 113 deletions(-) create mode 100644 tensorflow/core/kernels/data/experimental/assert_next_dataset_op.h diff --git a/tensorflow/core/kernels/data/experimental/BUILD b/tensorflow/core/kernels/data/experimental/BUILD index d16f580d1c5..cd27ca357e6 100644 --- a/tensorflow/core/kernels/data/experimental/BUILD +++ b/tensorflow/core/kernels/data/experimental/BUILD @@ -3,6 +3,7 @@ load( "//tensorflow:tensorflow.bzl", + "tf_cc_test", "tf_kernel_library", ) @@ -16,9 +17,29 @@ exports_files(["LICENSE"]) tf_kernel_library( name = "assert_next_dataset_op", srcs = ["assert_next_dataset_op.cc"], + hdrs = ["assert_next_dataset_op.h"], deps = [ "//tensorflow/core:experimental_dataset_ops_op_lib", "//tensorflow/core:framework", + "//tensorflow/core/kernels/data:name_utils", + "//third_party/eigen3", + ], +) + +tf_cc_test( + name = "assert_next_dataset_op_test", + size = 
"small", + srcs = ["assert_next_dataset_op_test.cc"], + deps = [ + ":assert_next_dataset_op", + "//tensorflow/core:experimental_dataset_ops_op_lib", + "//tensorflow/core:framework", + "//tensorflow/core/kernels/data:dataset_test_base", + "//tensorflow/core/kernels/data:range_dataset_op", + "//tensorflow/core/kernels/data:take_dataset_op", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", "//third_party/eigen3", ], ) diff --git a/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc b/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc index b84d813c023..592d8db8281 100644 --- a/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc @@ -12,149 +12,146 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/core/kernels/data/experimental/assert_next_dataset_op.h" + #include #include "tensorflow/core/framework/dataset.h" #include "tensorflow/core/framework/partial_tensor_shape.h" #include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/kernels/data/name_utils.h" namespace tensorflow { namespace data { -namespace { -// See documentation in ../ops/dataset_ops.cc for a high-level -// description of the following op. -class AssertNextDatasetOp : public UnaryDatasetOpKernel { +/* static */ constexpr const char* const AssertNextDatasetOp::kInputDataset; +/* static */ constexpr const char* const AssertNextDatasetOp::kDatasetType; +/* static */ constexpr const char* const AssertNextDatasetOp::kTransformations; +/* static */ constexpr const char* const AssertNextDatasetOp::kOutputTypes; +/* static */ constexpr const char* const AssertNextDatasetOp::kOutputShapes; + +class AssertNextDatasetOp::Dataset : public DatasetBase { public: - explicit AssertNextDatasetOp(OpKernelConstruction* ctx) - : UnaryDatasetOpKernel(ctx) { - OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_)); - OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_)); + Dataset(OpKernelContext* ctx, const DatasetBase* input, + const std::vector& transformations, + const DataTypeVector& output_types, + const std::vector& output_shapes) + : DatasetBase(DatasetContext(ctx)), + input_(input), + transformations_(transformations), + output_types_(output_types), + output_shapes_(output_shapes) { + input_->Ref(); } + ~Dataset() override { input_->Unref(); } + + std::unique_ptr MakeIteratorInternal( + const string& prefix) const override { + return absl::make_unique(Iterator::Params{ + this, name_utils::IteratorPrefix(kDatasetType, prefix)}); + } + + const DataTypeVector& output_dtypes() const override { return output_types_; } + const std::vector& output_shapes() const override { + return output_shapes_; + } + + string DebugString() const override { + return name_utils::DatasetDebugString(kDatasetType); + } + + int64 Cardinality() const override { return input_->Cardinality(); } + protected: - void MakeDataset(OpKernelContext* ctx, DatasetBase* input, - DatasetBase** output) override { - std::vector transformations; - OP_REQUIRES_OK(ctx, ParseVectorArgument(ctx, "transformations", - &transformations)); - *output = - new Dataset(ctx, input, transformations, output_types_, output_shapes_); + Status 
AsGraphDefInternal(SerializationContext* ctx, + DatasetGraphDefBuilder* b, + Node** output) const override { + Node* input_graph_node = nullptr; + TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node)); + Node* transformations_node = nullptr; + TF_RETURN_IF_ERROR(b->AddVector(transformations_, &transformations_node)); + TF_RETURN_IF_ERROR( + b->AddDataset(this, {input_graph_node, transformations_node}, output)); + return Status::OK(); } private: - class Dataset : public DatasetBase { + class Iterator : public DatasetIterator { public: - Dataset(OpKernelContext* ctx, const DatasetBase* input, - const std::vector& transformations, - const DataTypeVector& output_types, - const std::vector& output_shapes) - : DatasetBase(DatasetContext(ctx)), - input_(input), - transformations_(transformations), - output_types_(output_types), - output_shapes_(output_shapes) { - input_->Ref(); + explicit Iterator(const Params& params) + : DatasetIterator(params) {} + + Status Initialize(IteratorContext* ctx) override { + std::vector tokens = + absl::StrSplit(prefix(), ':', absl::SkipEmpty()); + if (dataset()->transformations_.size() > tokens.size() - 2) { + return errors::InvalidArgument( + "Asserted next ", dataset()->transformations_.size(), + " transformations but encountered only ", tokens.size() - 2, "."); + } + int n = tokens.size(); + for (size_t i = 0; i < dataset()->transformations_.size(); ++i) { + if (dataset()->transformations_[i] != tokens[n - 2 - i]) { + return errors::InvalidArgument( + "Asserted ", dataset()->transformations_[i], + " transformation at offset ", i, " but encountered ", + tokens[n - 2 - i], " transformation instead."); + } + } + return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_); } - ~Dataset() override { input_->Unref(); } - - std::unique_ptr MakeIteratorInternal( - const string& prefix) const override { - return absl::make_unique( - Iterator::Params{this, strings::StrCat(prefix, "::AssertNext")}); + Status GetNextInternal(IteratorContext* ctx, + std::vector* out_tensors, + bool* end_of_sequence) override { + return input_impl_->GetNext(ctx, out_tensors, end_of_sequence); } - const DataTypeVector& output_dtypes() const override { - return output_types_; - } - const std::vector& output_shapes() const override { - return output_shapes_; - } - - string DebugString() const override { - return "AssertNextDatasetOp::Dataset"; - } - - int64 Cardinality() const override { return input_->Cardinality(); } - protected: - Status AsGraphDefInternal(SerializationContext* ctx, - DatasetGraphDefBuilder* b, - Node** output) const override { - Node* input_graph_node = nullptr; - TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node)); - Node* transformations_node = nullptr; - TF_RETURN_IF_ERROR(b->AddVector(transformations_, &transformations_node)); - TF_RETURN_IF_ERROR(b->AddDataset( - this, {input_graph_node, transformations_node}, output)); + std::shared_ptr CreateNode( + IteratorContext* ctx, model::Node::Args args) const override { + return model::MakeKnownRatioNode(std::move(args), + /*ratio=*/1); + } + + Status SaveInternal(IteratorStateWriter* writer) override { + TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_)); + return Status::OK(); + } + + Status RestoreInternal(IteratorContext* ctx, + IteratorStateReader* reader) override { + TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_)); return Status::OK(); } private: - class Iterator : public DatasetIterator { - public: - explicit Iterator(const Params& params) - : 
DatasetIterator(params) {} - - Status Initialize(IteratorContext* ctx) override { - std::vector tokens = - absl::StrSplit(prefix(), ':', absl::SkipEmpty()); - if (dataset()->transformations_.size() > tokens.size() - 2) { - return errors::InvalidArgument( - "Asserted next ", dataset()->transformations_.size(), - " transformations but encountered only ", tokens.size() - 2, "."); - } - int n = tokens.size(); - for (size_t i = 0; i < dataset()->transformations_.size(); ++i) { - if (dataset()->transformations_[i] != tokens[n - 2 - i]) { - return errors::InvalidArgument( - "Asserted ", dataset()->transformations_[i], - " transformation at offset ", i, " but encountered ", - tokens[n - 2 - i], " transformation instead."); - } - } - return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_); - } - - Status GetNextInternal(IteratorContext* ctx, - std::vector* out_tensors, - bool* end_of_sequence) override { - return input_impl_->GetNext(ctx, out_tensors, end_of_sequence); - } - - protected: - std::shared_ptr CreateNode( - IteratorContext* ctx, model::Node::Args args) const override { - return model::MakeKnownRatioNode(std::move(args), - /*ratio=*/1); - } - - Status SaveInternal(IteratorStateWriter* writer) override { - TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_)); - return Status::OK(); - } - - Status RestoreInternal(IteratorContext* ctx, - IteratorStateReader* reader) override { - TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_)); - return Status::OK(); - } - - private: - std::unique_ptr input_impl_; - }; - - const DatasetBase* input_; - const std::vector transformations_; - const DataTypeVector output_types_; - const std::vector output_shapes_; + std::unique_ptr input_impl_; }; - DataTypeVector output_types_; - std::vector output_shapes_; + const DatasetBase* input_; + const std::vector transformations_; + const DataTypeVector output_types_; + const std::vector output_shapes_; }; +AssertNextDatasetOp::AssertNextDatasetOp(OpKernelConstruction* ctx) + : UnaryDatasetOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr(kOutputTypes, &output_types_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr(kOutputShapes, &output_shapes_)); +} + +void AssertNextDatasetOp::MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) { + std::vector transformations; + OP_REQUIRES_OK(ctx, ParseVectorArgument(ctx, kTransformations, + &transformations)); + *output = + new Dataset(ctx, input, transformations, output_types_, output_shapes_); +} + +namespace { REGISTER_KERNEL_BUILDER(Name("AssertNextDataset").Device(DEVICE_CPU), AssertNextDatasetOp); REGISTER_KERNEL_BUILDER( diff --git a/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.h b/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.h new file mode 100644 index 00000000000..aae2e80323e --- /dev/null +++ b/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.h @@ -0,0 +1,49 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_ASSERT_NEXT_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_ASSERT_NEXT_DATASET_OP_H_ + +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { + +// See documentation in ../../ops/experimental_dataset_ops.cc for a high-level +// description of the following op. + +class AssertNextDatasetOp : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "AssertNext"; + static constexpr const char* const kInputDataset = "input_dataset"; + static constexpr const char* const kTransformations = "transformations"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + + explicit AssertNextDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + class Dataset; + DataTypeVector output_types_; + std::vector output_shapes_; +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_ASSERT_NEXT_DATASET_OP_H_ From 445af53eb0202f3298967a3c0501e61bf8555709 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Fri, 19 Jul 2019 13:19:13 -0700 Subject: [PATCH 0182/3053] Move some shared functions to DatasetOpsTestBase --- tensorflow/core/kernels/data/BUILD | 1 + .../core/kernels/data/dataset_test_base.cc | 40 +++++++++++++++++++ .../core/kernels/data/dataset_test_base.h | 14 +++++++ .../kernels/data/optimize_dataset_op_test.cc | 39 ------------------ 4 files changed, 55 insertions(+), 39 deletions(-) diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD index 83252bfcbd8..a5f41b6dcae 100644 --- a/tensorflow/core/kernels/data/BUILD +++ b/tensorflow/core/kernels/data/BUILD @@ -30,6 +30,7 @@ cc_library( ":iterator_ops", ":name_utils", ":range_dataset_op", + ":take_dataset_op", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", diff --git a/tensorflow/core/kernels/data/dataset_test_base.cc b/tensorflow/core/kernels/data/dataset_test_base.cc index 8c9d775444f..2a5f03edf16 100644 --- a/tensorflow/core/kernels/data/dataset_test_base.cc +++ b/tensorflow/core/kernels/data/dataset_test_base.cc @@ -274,6 +274,46 @@ Status DatasetOpsTestBase::CreateTensorSliceDataset( return Status::OK(); } +// Create a `RangeDataset` dataset as a variant tensor. +Status DatasetOpsTestBase::MakeRangeDataset( + const Tensor& start, const Tensor& stop, const Tensor& step, + const DataTypeVector& output_types, + const std::vector& output_shapes, + Tensor* range_dataset) { + GraphConstructorOptions graph_opts; + graph_opts.allow_internal_ops = true; + graph_opts.expect_device_spec = false; + TF_RETURN_IF_ERROR( + RunFunction(test::function::MakeRangeDataset(), + /*attrs*/ + {{RangeDatasetOp::kOutputTypes, output_types}, + {RangeDatasetOp::kOutputShapes, output_shapes}}, + /*inputs*/ {start, stop, step}, graph_opts, + /*rets*/ {range_dataset})); + return Status::OK(); +} + +// Create a `TakeDataset` dataset as a variant tensor. 
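// (Added usage sketch, not part of the original patch.) MakeRangeDataset above
// and MakeTakeDataset below are meant to be chained: the variant tensor holding
// the RangeDataset becomes the input of the TakeDataset. Assuming the usual
// DatasetOpsTestBase::CreateTensor<T> helper (template arguments are implied
// here), a caller can build a Range(0, 10, 1) -> Take(3) pipeline tensor with:
//
//   Tensor start = CreateTensor<int64>(TensorShape({}), {0});
//   Tensor stop = CreateTensor<int64>(TensorShape({}), {10});
//   Tensor step = CreateTensor<int64>(TensorShape({}), {1});
//   Tensor range_dataset;
//   TF_RETURN_IF_ERROR(MakeRangeDataset(start, stop, step, {DT_INT64},
//                                       {PartialTensorShape({})},
//                                       &range_dataset));
//   Tensor take_dataset;
//   TF_RETURN_IF_ERROR(MakeTakeDataset(range_dataset, /*count=*/3, {DT_INT64},
//                                      {PartialTensorShape({})},
//                                      &take_dataset));
//
// This is the same composition the AssertNextDatasetOp test later in this
// series builds in MakeRangeAndTakeDatasetTensor.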
+Status DatasetOpsTestBase::MakeTakeDataset( + const Tensor& input_dataset, int64 count, + const DataTypeVector& output_types, + const std::vector& output_shapes, + Tensor* take_dataset) { + GraphConstructorOptions graph_opts; + graph_opts.allow_internal_ops = true; + graph_opts.expect_device_spec = false; + + Tensor count_tensor = CreateTensor(TensorShape({}), {count}); + TF_RETURN_IF_ERROR( + RunFunction(test::function::MakeTakeDataset(), + /*attrs*/ + {{TakeDatasetOp::kOutputTypes, output_types}, + {TakeDatasetOp::kOutputShapes, output_shapes}}, + /*inputs*/ {input_dataset, count_tensor}, graph_opts, + /*rets*/ {take_dataset})); + return Status::OK(); +} + Status DatasetOpsTestBase::CreateOpKernel( const NodeDef& node_def, std::unique_ptr* op_kernel) { OpKernel* kernel; diff --git a/tensorflow/core/kernels/data/dataset_test_base.h b/tensorflow/core/kernels/data/dataset_test_base.h index 75a221e2782..427cccac9f9 100644 --- a/tensorflow/core/kernels/data/dataset_test_base.h +++ b/tensorflow/core/kernels/data/dataset_test_base.h @@ -33,6 +33,7 @@ limitations under the License. #include "tensorflow/core/kernels/data/iterator_ops.h" #include "tensorflow/core/kernels/data/name_utils.h" #include "tensorflow/core/kernels/data/range_dataset_op.h" +#include "tensorflow/core/kernels/data/take_dataset_op.h" #include "tensorflow/core/kernels/ops_testutil.h" #include "tensorflow/core/lib/io/zlib_compression_options.h" #include "tensorflow/core/lib/io/zlib_outputbuffer.h" @@ -177,6 +178,19 @@ class DatasetOpsTestBase : public ::testing::Test { std::vector* const components, DatasetBase** tensor_slice_dataset); + // Creates a `RangeDataset` dataset as a variant tensor. + Status MakeRangeDataset(const Tensor& start, const Tensor& stop, + const Tensor& step, + const DataTypeVector& output_types, + const std::vector& output_shapes, + Tensor* range_dataset); + + // Creates a `TakeDataset` dataset as a variant tensor. + Status MakeTakeDataset(const Tensor& input_dataset, int64 count, + const DataTypeVector& output_types, + const std::vector& output_shapes, + Tensor* take_dataset); + // Fetches the dataset from the operation context. Status GetDatasetFromContext(OpKernelContext* context, int output_index, DatasetBase** const dataset); diff --git a/tensorflow/core/kernels/data/optimize_dataset_op_test.cc b/tensorflow/core/kernels/data/optimize_dataset_op_test.cc index 94dda91dbef..4469c6eebf7 100644 --- a/tensorflow/core/kernels/data/optimize_dataset_op_test.cc +++ b/tensorflow/core/kernels/data/optimize_dataset_op_test.cc @@ -50,45 +50,6 @@ class OptimizeDatasetOpTest : public DatasetOpsTestBase { TF_RETURN_IF_ERROR(CreateOpKernelContext(op_kernel, inputs, context)); return Status::OK(); } - - // Create a `RangeDataset` dataset as a variant tensor. - Status MakeRangeDataset(const Tensor& start, const Tensor& stop, - const Tensor& step, - const DataTypeVector& output_types, - const std::vector& output_shapes, - Tensor* range_dataset) { - GraphConstructorOptions graph_opts; - graph_opts.allow_internal_ops = true; - graph_opts.expect_device_spec = false; - TF_RETURN_IF_ERROR( - RunFunction(test::function::MakeRangeDataset(), - /*attrs*/ - {{RangeDatasetOp::kOutputTypes, output_types}, - {RangeDatasetOp::kOutputShapes, output_shapes}}, - /*inputs*/ {start, stop, step}, graph_opts, - /*rets*/ {range_dataset})); - return Status::OK(); - } - - // Create a `TakeDataset` dataset as a variant tensor. 
- Status MakeTakeDataset(const Tensor& input_dataset, int64 count, - const DataTypeVector& output_types, - const std::vector& output_shapes, - Tensor* take_dataset) { - GraphConstructorOptions graph_opts; - graph_opts.allow_internal_ops = true; - graph_opts.expect_device_spec = false; - - Tensor count_tensor = CreateTensor(TensorShape({}), {count}); - TF_RETURN_IF_ERROR( - RunFunction(test::function::MakeTakeDataset(), - /*attrs*/ - {{TakeDatasetOp::kOutputTypes, output_types}, - {TakeDatasetOp::kOutputShapes, output_shapes}}, - /*inputs*/ {input_dataset, count_tensor}, graph_opts, - /*rets*/ {take_dataset})); - return Status::OK(); - } }; TEST_F(OptimizeDatasetOpTest, NoopElimination) { From 7eea49660a365070b59cfdf422563d4059b9db72 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Fri, 19 Jul 2019 13:22:12 -0700 Subject: [PATCH 0183/3053] Tests for AssertNextDatasetOp --- .../core/kernels/data/experimental/BUILD | 4 +- .../assert_next_dataset_op_test.cc | 667 ++++++++++++++++++ 2 files changed, 668 insertions(+), 3 deletions(-) create mode 100644 tensorflow/core/kernels/data/experimental/assert_next_dataset_op_test.cc diff --git a/tensorflow/core/kernels/data/experimental/BUILD b/tensorflow/core/kernels/data/experimental/BUILD index cd27ca357e6..2ff370e92a6 100644 --- a/tensorflow/core/kernels/data/experimental/BUILD +++ b/tensorflow/core/kernels/data/experimental/BUILD @@ -34,12 +34,10 @@ tf_cc_test( ":assert_next_dataset_op", "//tensorflow/core:experimental_dataset_ops_op_lib", "//tensorflow/core:framework", - "//tensorflow/core/kernels/data:dataset_test_base", - "//tensorflow/core/kernels/data:range_dataset_op", - "//tensorflow/core/kernels/data:take_dataset_op", "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core:testlib", + "//tensorflow/core/kernels/data:dataset_test_base", "//third_party/eigen3", ], ) diff --git a/tensorflow/core/kernels/data/experimental/assert_next_dataset_op_test.cc b/tensorflow/core/kernels/data/experimental/assert_next_dataset_op_test.cc new file mode 100644 index 00000000000..e256d5ba008 --- /dev/null +++ b/tensorflow/core/kernels/data/experimental/assert_next_dataset_op_test.cc @@ -0,0 +1,667 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/core/kernels/data/experimental/assert_next_dataset_op.h" + +#include "tensorflow/core/kernels/data/dataset_test_base.h" + +namespace tensorflow { +namespace data { +namespace { + +constexpr char kNodeName[] = "assert_next_dataset"; + +struct RangeDatasetParams { + int start; + int stop; + int step; +}; + +struct TakeDatasetParams { + int count; +}; + +class AssertNextDatasetOpTest : public DatasetOpsTestBase { + protected: + // Creates a new `AssertNextDataset` op kernel. 
+ Status CreateAssertNextDatasetOpKernel( + const DataTypeVector& output_types, + const std::vector& output_shapes, + std::unique_ptr* assert_next_dataset_op_kernel) { + NodeDef node_def = test::function::NDef( + kNodeName, name_utils::OpName(AssertNextDatasetOp::kDatasetType), + {AssertNextDatasetOp::kInputDataset, + AssertNextDatasetOp::kTransformations}, + {{AssertNextDatasetOp::kOutputTypes, output_types}, + {AssertNextDatasetOp::kOutputShapes, output_shapes}}); + TF_RETURN_IF_ERROR(CreateOpKernel(node_def, assert_next_dataset_op_kernel)); + return Status::OK(); + } + + // Creates a new `AssertNextDataset` op kernel context. + Status CreateAssertNextDatasetContext( + OpKernel* const op_kernel, + gtl::InlinedVector* const inputs, + std::unique_ptr* context) { + TF_RETURN_IF_ERROR(CheckOpKernelInput(*op_kernel, *inputs)); + TF_RETURN_IF_ERROR(CreateOpKernelContext(op_kernel, inputs, context)); + return Status::OK(); + } + + // Creates a new `RangeAndTakeDataset` tensor. + Status MakeRangeAndTakeDatasetTensor( + const RangeDatasetParams& range_dataset_params, + const TakeDatasetParams& take_dataset_params, + Tensor* range_and_take_dataset_tensor) { + Tensor range_dataset_tensor; + Tensor start = + CreateTensor(TensorShape({}), {range_dataset_params.start}); + Tensor stop = + CreateTensor(TensorShape({}), {range_dataset_params.stop}); + Tensor step = + CreateTensor(TensorShape({}), {range_dataset_params.step}); + TF_RETURN_IF_ERROR(MakeRangeDataset(start, stop, step, {DT_INT64}, + {PartialTensorShape({})}, + &range_dataset_tensor)); + + TF_RETURN_IF_ERROR(MakeTakeDataset( + range_dataset_tensor, take_dataset_params.count, {DT_INT64}, + {PartialTensorShape({})}, range_and_take_dataset_tensor)); + return Status::OK(); + } +}; + +struct TestCase { + RangeDatasetParams range_dataset_params; + TakeDatasetParams take_dataset_params; + Tensor transformations; + std::vector expected_outputs; + DataTypeVector expected_output_dtypes; + std::vector expected_output_shapes; + int64 expected_cardinality; + std::vector breakpoints; +}; + +// Test case 1 : assert one transformation. +TestCase TestCase1() { + return {/*range_dataset_params*/ {/*start*/ 0, /*stop*/ 10, /*step*/ 1}, + /*take_dataset_params*/ {/*count*/ 3}, + /*transformations*/ + DatasetOpsTestBase::CreateTensor( + TensorShape({1}), {TakeDatasetOp::kDatasetType}), + /*expected_outputs*/ + {DatasetOpsTestBase::CreateTensor(TensorShape({}), {0}), + DatasetOpsTestBase::CreateTensor(TensorShape({}), {1}), + DatasetOpsTestBase::CreateTensor(TensorShape({}), {2})}, + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({})}, + /*expected_cardinality*/ 3, + /*breakpoints*/ {0, 2, 5}}; +} + +// Test case 2 : assert two transformations. 
+TestCase TestCase2() { + return {/*range_dataset_params*/ {/*start*/ 0, /*stop*/ 10, /*step*/ 1}, + /*take_dataset_params*/ {/*count*/ 3}, + /*transformations*/ + DatasetOpsTestBase::CreateTensor( + TensorShape({2}), + {TakeDatasetOp::kDatasetType, RangeDatasetOp::kDatasetType}), + /*expected_outputs*/ + {DatasetOpsTestBase::CreateTensor(TensorShape({}), {0}), + DatasetOpsTestBase::CreateTensor(TensorShape({}), {1}), + DatasetOpsTestBase::CreateTensor(TensorShape({}), {2})}, + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({})}, + /*expected_cardinality*/ 3, + /*breakpoints*/ {0, 2, 5}}; +} + +TestCase AssertNextInvalid() { + return { + /*range_dataset_params*/ {/*start*/ 0, /*stop*/ 10, /*step*/ 1}, + /*take_dataset_params*/ {/*count*/ 3}, + /*transformations*/ + DatasetOpsTestBase::CreateTensor(TensorShape({1}), {"Whoops"}), + /*expected_outputs*/ + {DatasetOpsTestBase::CreateTensor(TensorShape({}), {0}), + DatasetOpsTestBase::CreateTensor(TensorShape({}), {1}), + DatasetOpsTestBase::CreateTensor(TensorShape({}), {2})}, + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({})}, + /*expected_cardinality*/ 3, + /*breakpoints*/ {0, 2, 5}}; +} + +TestCase AssertNextShort() { + return {/*range_dataset_params*/ {/*start*/ 0, /*stop*/ 10, /*step*/ 1}, + /*take_dataset_params*/ {/*count*/ 3}, + /*transformations*/ + DatasetOpsTestBase::CreateTensor( + TensorShape({3}), {TakeDatasetOp::kDatasetType, + RangeDatasetOp::kDatasetType, "Whoops"}), + /*expected_outputs*/ + {DatasetOpsTestBase::CreateTensor(TensorShape({}), {0}), + DatasetOpsTestBase::CreateTensor(TensorShape({}), {1}), + DatasetOpsTestBase::CreateTensor(TensorShape({}), {2})}, + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({})}, + /*expected_cardinality*/ 3, + /*breakpoints*/ {0, 2, 5}}; +} + +class ParameterizedAssertNextDatasetOpTest + : public AssertNextDatasetOpTest, + public ::testing::WithParamInterface {}; + +TEST_P(ParameterizedAssertNextDatasetOpTest, GetNext) { + int thread_num = 2, cpu_num = 2; + TestCase test_case = GetParam(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + + Tensor range_and_take_dataset_tensor; + TF_ASSERT_OK(MakeRangeAndTakeDatasetTensor(test_case.range_dataset_params, + test_case.take_dataset_params, + &range_and_take_dataset_tensor)); + + std::unique_ptr assert_next_dataset_kernel; + TF_ASSERT_OK(CreateAssertNextDatasetOpKernel(test_case.expected_output_dtypes, + test_case.expected_output_shapes, + &assert_next_dataset_kernel)); + Tensor transformations = test_case.transformations; + gtl::InlinedVector inputs( + {TensorValue(&range_and_take_dataset_tensor), + TensorValue(&transformations)}); + std::unique_ptr assert_next_dataset_context; + TF_ASSERT_OK(CreateAssertNextDatasetContext( + assert_next_dataset_kernel.get(), &inputs, &assert_next_dataset_context)); + + DatasetBase* assert_next_dataset; + TF_ASSERT_OK(CreateDataset(assert_next_dataset_kernel.get(), + assert_next_dataset_context.get(), + &assert_next_dataset)); + core::ScopedUnref scoped_unref(assert_next_dataset); + + std::unique_ptr iterator_context; + TF_ASSERT_OK(CreateIteratorContext(assert_next_dataset_context.get(), + &iterator_context)); + std::unique_ptr iterator; + string iterator_prefix = name_utils::IteratorPrefix( + TakeDatasetOp::kDatasetType, + name_utils::IteratorPrefix(RangeDatasetOp::kDatasetType, "Iterator")); + 
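  // (Added reading of the kernel code, not text from the patch.) The prefix
  // built above expands to "Iterator::Range::Take", assuming
  // name_utils::IteratorPrefix(type, prefix) appends "::<type>". MakeIterator
  // below therefore hands the AssertNext iterator the prefix
  // "Iterator::Range::Take::AssertNext". Its Initialize() splits that string on
  // ':' and walks backwards from the token just before "AssertNext":
  //   tokens            = {"Iterator", "Range", "Take", "AssertNext"}
  //   {"Take"}          -> matches tokens[n-2]              (TestCase1)
  //   {"Take", "Range"} -> matches tokens[n-2], tokens[n-3] (TestCase2)
  // That is also why AssertNextInvalid ("Whoops") and AssertNextShort (three
  // names against only two upstream tokens) are expected to fail with
  // INVALID_ARGUMENT in the InvalidArguments test at the end of this file.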
TF_ASSERT_OK(assert_next_dataset->MakeIterator(iterator_context.get(), + iterator_prefix, &iterator)); + + bool end_of_sequence = false; + std::vector out_tensors; + while (!end_of_sequence) { + std::vector next; + TF_EXPECT_OK( + iterator->GetNext(iterator_context.get(), &next, &end_of_sequence)); + out_tensors.insert(out_tensors.end(), next.begin(), next.end()); + } + + TF_EXPECT_OK(ExpectEqual(out_tensors, test_case.expected_outputs, + /*compare_order*/ true)); +} + +TEST_F(AssertNextDatasetOpTest, DatasetNodeName) { + int thread_num = 2, cpu_num = 2; + TestCase test_case = TestCase1(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + + Tensor range_and_take_dataset_tensor; + TF_ASSERT_OK(MakeRangeAndTakeDatasetTensor(test_case.range_dataset_params, + test_case.take_dataset_params, + &range_and_take_dataset_tensor)); + + std::unique_ptr assert_next_dataset_kernel; + TF_ASSERT_OK(CreateAssertNextDatasetOpKernel(test_case.expected_output_dtypes, + test_case.expected_output_shapes, + &assert_next_dataset_kernel)); + Tensor transformations = test_case.transformations; + gtl::InlinedVector inputs( + {TensorValue(&range_and_take_dataset_tensor), + TensorValue(&transformations)}); + std::unique_ptr assert_next_dataset_context; + TF_ASSERT_OK(CreateAssertNextDatasetContext( + assert_next_dataset_kernel.get(), &inputs, &assert_next_dataset_context)); + + DatasetBase* assert_next_dataset; + TF_ASSERT_OK(CreateDataset(assert_next_dataset_kernel.get(), + assert_next_dataset_context.get(), + &assert_next_dataset)); + core::ScopedUnref scoped_unref(assert_next_dataset); + + EXPECT_EQ(assert_next_dataset->node_name(), kNodeName); +} + +TEST_P(ParameterizedAssertNextDatasetOpTest, DatasetTypeString) { + int thread_num = 2, cpu_num = 2; + TestCase test_case = GetParam(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + + Tensor range_and_take_dataset_tensor; + TF_ASSERT_OK(MakeRangeAndTakeDatasetTensor(test_case.range_dataset_params, + test_case.take_dataset_params, + &range_and_take_dataset_tensor)); + + std::unique_ptr assert_next_dataset_kernel; + TF_ASSERT_OK(CreateAssertNextDatasetOpKernel(test_case.expected_output_dtypes, + test_case.expected_output_shapes, + &assert_next_dataset_kernel)); + Tensor transformations = test_case.transformations; + gtl::InlinedVector inputs( + {TensorValue(&range_and_take_dataset_tensor), + TensorValue(&transformations)}); + std::unique_ptr assert_next_dataset_context; + TF_ASSERT_OK(CreateAssertNextDatasetContext( + assert_next_dataset_kernel.get(), &inputs, &assert_next_dataset_context)); + + DatasetBase* assert_next_dataset; + TF_ASSERT_OK(CreateDataset(assert_next_dataset_kernel.get(), + assert_next_dataset_context.get(), + &assert_next_dataset)); + core::ScopedUnref scoped_unref(assert_next_dataset); + + EXPECT_EQ(assert_next_dataset->type_string(), + name_utils::OpName(AssertNextDatasetOp::kDatasetType)); +} + +TEST_P(ParameterizedAssertNextDatasetOpTest, DatasetOutputDtypes) { + int thread_num = 2, cpu_num = 2; + TestCase test_case = GetParam(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + + Tensor range_and_take_dataset_tensor; + TF_ASSERT_OK(MakeRangeAndTakeDatasetTensor(test_case.range_dataset_params, + test_case.take_dataset_params, + &range_and_take_dataset_tensor)); + + std::unique_ptr assert_next_dataset_kernel; + TF_ASSERT_OK(CreateAssertNextDatasetOpKernel(test_case.expected_output_dtypes, 
+ test_case.expected_output_shapes, + &assert_next_dataset_kernel)); + Tensor transformations = test_case.transformations; + gtl::InlinedVector inputs( + {TensorValue(&range_and_take_dataset_tensor), + TensorValue(&transformations)}); + std::unique_ptr assert_next_dataset_context; + TF_ASSERT_OK(CreateAssertNextDatasetContext( + assert_next_dataset_kernel.get(), &inputs, &assert_next_dataset_context)); + + DatasetBase* assert_next_dataset; + TF_ASSERT_OK(CreateDataset(assert_next_dataset_kernel.get(), + assert_next_dataset_context.get(), + &assert_next_dataset)); + core::ScopedUnref scoped_unref(assert_next_dataset); + + TF_EXPECT_OK(VerifyTypesMatch(assert_next_dataset->output_dtypes(), + test_case.expected_output_dtypes)); +} + +TEST_P(ParameterizedAssertNextDatasetOpTest, DatasetOutputShapes) { + int thread_num = 2, cpu_num = 2; + TestCase test_case = GetParam(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + + Tensor range_and_take_dataset_tensor; + TF_ASSERT_OK(MakeRangeAndTakeDatasetTensor(test_case.range_dataset_params, + test_case.take_dataset_params, + &range_and_take_dataset_tensor)); + + std::unique_ptr assert_next_dataset_kernel; + TF_ASSERT_OK(CreateAssertNextDatasetOpKernel(test_case.expected_output_dtypes, + test_case.expected_output_shapes, + &assert_next_dataset_kernel)); + Tensor transformations = test_case.transformations; + gtl::InlinedVector inputs( + {TensorValue(&range_and_take_dataset_tensor), + TensorValue(&transformations)}); + std::unique_ptr assert_next_dataset_context; + TF_ASSERT_OK(CreateAssertNextDatasetContext( + assert_next_dataset_kernel.get(), &inputs, &assert_next_dataset_context)); + + DatasetBase* assert_next_dataset; + TF_ASSERT_OK(CreateDataset(assert_next_dataset_kernel.get(), + assert_next_dataset_context.get(), + &assert_next_dataset)); + core::ScopedUnref scoped_unref(assert_next_dataset); + + TF_EXPECT_OK(VerifyShapesCompatible(assert_next_dataset->output_shapes(), + test_case.expected_output_shapes)); +} + +TEST_P(ParameterizedAssertNextDatasetOpTest, Cardinality) { + int thread_num = 2, cpu_num = 2; + TestCase test_case = GetParam(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + + Tensor range_and_take_dataset_tensor; + TF_ASSERT_OK(MakeRangeAndTakeDatasetTensor(test_case.range_dataset_params, + test_case.take_dataset_params, + &range_and_take_dataset_tensor)); + + std::unique_ptr assert_next_dataset_kernel; + TF_ASSERT_OK(CreateAssertNextDatasetOpKernel(test_case.expected_output_dtypes, + test_case.expected_output_shapes, + &assert_next_dataset_kernel)); + Tensor transformations = test_case.transformations; + gtl::InlinedVector inputs( + {TensorValue(&range_and_take_dataset_tensor), + TensorValue(&transformations)}); + std::unique_ptr assert_next_dataset_context; + TF_ASSERT_OK(CreateAssertNextDatasetContext( + assert_next_dataset_kernel.get(), &inputs, &assert_next_dataset_context)); + + DatasetBase* assert_next_dataset; + TF_ASSERT_OK(CreateDataset(assert_next_dataset_kernel.get(), + assert_next_dataset_context.get(), + &assert_next_dataset)); + core::ScopedUnref scoped_unref(assert_next_dataset); + + EXPECT_EQ(assert_next_dataset->Cardinality(), test_case.expected_cardinality); +} + +TEST_P(ParameterizedAssertNextDatasetOpTest, DatasetSave) { + int thread_num = 2, cpu_num = 2; + TestCase test_case = GetParam(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + + Tensor 
range_and_take_dataset_tensor; + TF_ASSERT_OK(MakeRangeAndTakeDatasetTensor(test_case.range_dataset_params, + test_case.take_dataset_params, + &range_and_take_dataset_tensor)); + + std::unique_ptr assert_next_dataset_kernel; + TF_ASSERT_OK(CreateAssertNextDatasetOpKernel(test_case.expected_output_dtypes, + test_case.expected_output_shapes, + &assert_next_dataset_kernel)); + Tensor transformations = test_case.transformations; + gtl::InlinedVector inputs( + {TensorValue(&range_and_take_dataset_tensor), + TensorValue(&transformations)}); + std::unique_ptr assert_next_dataset_context; + TF_ASSERT_OK(CreateAssertNextDatasetContext( + assert_next_dataset_kernel.get(), &inputs, &assert_next_dataset_context)); + + DatasetBase* assert_next_dataset; + TF_ASSERT_OK(CreateDataset(assert_next_dataset_kernel.get(), + assert_next_dataset_context.get(), + &assert_next_dataset)); + core::ScopedUnref scoped_unref(assert_next_dataset); + + std::unique_ptr serialization_context; + TF_ASSERT_OK(CreateSerializationContext(&serialization_context)); + VariantTensorData data; + VariantTensorDataWriter writer(&data); + TF_ASSERT_OK(assert_next_dataset->Save(serialization_context.get(), &writer)); + TF_ASSERT_OK(writer.Flush()); +} + +TEST_P(ParameterizedAssertNextDatasetOpTest, IteratorOutputDtypes) { + int thread_num = 2, cpu_num = 2; + TestCase test_case = GetParam(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + + Tensor range_and_take_dataset_tensor; + TF_ASSERT_OK(MakeRangeAndTakeDatasetTensor(test_case.range_dataset_params, + test_case.take_dataset_params, + &range_and_take_dataset_tensor)); + + std::unique_ptr assert_next_dataset_kernel; + TF_ASSERT_OK(CreateAssertNextDatasetOpKernel(test_case.expected_output_dtypes, + test_case.expected_output_shapes, + &assert_next_dataset_kernel)); + Tensor transformations = test_case.transformations; + gtl::InlinedVector inputs( + {TensorValue(&range_and_take_dataset_tensor), + TensorValue(&transformations)}); + std::unique_ptr assert_next_dataset_context; + TF_ASSERT_OK(CreateAssertNextDatasetContext( + assert_next_dataset_kernel.get(), &inputs, &assert_next_dataset_context)); + + DatasetBase* assert_next_dataset; + TF_ASSERT_OK(CreateDataset(assert_next_dataset_kernel.get(), + assert_next_dataset_context.get(), + &assert_next_dataset)); + core::ScopedUnref scoped_unref(assert_next_dataset); + + std::unique_ptr iterator_context; + TF_ASSERT_OK(CreateIteratorContext(assert_next_dataset_context.get(), + &iterator_context)); + std::unique_ptr iterator; + string iterator_prefix = name_utils::IteratorPrefix( + TakeDatasetOp::kDatasetType, + name_utils::IteratorPrefix(RangeDatasetOp::kDatasetType, "Iterator")); + TF_ASSERT_OK(assert_next_dataset->MakeIterator(iterator_context.get(), + iterator_prefix, &iterator)); + + TF_EXPECT_OK(VerifyTypesMatch(iterator->output_dtypes(), + test_case.expected_output_dtypes)); +} + +TEST_P(ParameterizedAssertNextDatasetOpTest, IteratorOutputShapes) { + int thread_num = 2, cpu_num = 2; + TestCase test_case = GetParam(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + + Tensor range_and_take_dataset_tensor; + TF_ASSERT_OK(MakeRangeAndTakeDatasetTensor(test_case.range_dataset_params, + test_case.take_dataset_params, + &range_and_take_dataset_tensor)); + + std::unique_ptr assert_next_dataset_kernel; + TF_ASSERT_OK(CreateAssertNextDatasetOpKernel(test_case.expected_output_dtypes, + test_case.expected_output_shapes, + 
&assert_next_dataset_kernel)); + Tensor transformations = test_case.transformations; + gtl::InlinedVector inputs( + {TensorValue(&range_and_take_dataset_tensor), + TensorValue(&transformations)}); + std::unique_ptr assert_next_dataset_context; + TF_ASSERT_OK(CreateAssertNextDatasetContext( + assert_next_dataset_kernel.get(), &inputs, &assert_next_dataset_context)); + + DatasetBase* assert_next_dataset; + TF_ASSERT_OK(CreateDataset(assert_next_dataset_kernel.get(), + assert_next_dataset_context.get(), + &assert_next_dataset)); + core::ScopedUnref scoped_unref(assert_next_dataset); + + std::unique_ptr iterator_context; + TF_ASSERT_OK(CreateIteratorContext(assert_next_dataset_context.get(), + &iterator_context)); + std::unique_ptr iterator; + string iterator_prefix = name_utils::IteratorPrefix( + TakeDatasetOp::kDatasetType, + name_utils::IteratorPrefix(RangeDatasetOp::kDatasetType, "Iterator")); + TF_ASSERT_OK(assert_next_dataset->MakeIterator(iterator_context.get(), + iterator_prefix, &iterator)); + + TF_EXPECT_OK(VerifyShapesCompatible(iterator->output_shapes(), + test_case.expected_output_shapes)); +} + +TEST_P(ParameterizedAssertNextDatasetOpTest, IteratorOutputPrefix) { + int thread_num = 2, cpu_num = 2; + TestCase test_case = GetParam(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + + Tensor range_and_take_dataset_tensor; + TF_ASSERT_OK(MakeRangeAndTakeDatasetTensor(test_case.range_dataset_params, + test_case.take_dataset_params, + &range_and_take_dataset_tensor)); + + std::unique_ptr assert_next_dataset_kernel; + TF_ASSERT_OK(CreateAssertNextDatasetOpKernel(test_case.expected_output_dtypes, + test_case.expected_output_shapes, + &assert_next_dataset_kernel)); + Tensor transformations = test_case.transformations; + gtl::InlinedVector inputs( + {TensorValue(&range_and_take_dataset_tensor), + TensorValue(&transformations)}); + std::unique_ptr assert_next_dataset_context; + TF_ASSERT_OK(CreateAssertNextDatasetContext( + assert_next_dataset_kernel.get(), &inputs, &assert_next_dataset_context)); + + DatasetBase* assert_next_dataset; + TF_ASSERT_OK(CreateDataset(assert_next_dataset_kernel.get(), + assert_next_dataset_context.get(), + &assert_next_dataset)); + core::ScopedUnref scoped_unref(assert_next_dataset); + + std::unique_ptr iterator_context; + TF_ASSERT_OK(CreateIteratorContext(assert_next_dataset_context.get(), + &iterator_context)); + std::unique_ptr iterator; + string iterator_prefix = name_utils::IteratorPrefix( + TakeDatasetOp::kDatasetType, + name_utils::IteratorPrefix(RangeDatasetOp::kDatasetType, "Iterator")); + TF_ASSERT_OK(assert_next_dataset->MakeIterator(iterator_context.get(), + iterator_prefix, &iterator)); + + EXPECT_EQ(iterator->prefix(), + name_utils::IteratorPrefix(AssertNextDatasetOp::kDatasetType, + iterator_prefix)); +} + +TEST_P(ParameterizedAssertNextDatasetOpTest, Roundtrip) { + int thread_num = 2, cpu_num = 2; + TestCase test_case = GetParam(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + + Tensor range_and_take_dataset_tensor; + TF_ASSERT_OK(MakeRangeAndTakeDatasetTensor(test_case.range_dataset_params, + test_case.take_dataset_params, + &range_and_take_dataset_tensor)); + + std::unique_ptr assert_next_dataset_kernel; + TF_ASSERT_OK(CreateAssertNextDatasetOpKernel(test_case.expected_output_dtypes, + test_case.expected_output_shapes, + &assert_next_dataset_kernel)); + Tensor transformations = test_case.transformations; + gtl::InlinedVector inputs( + 
{TensorValue(&range_and_take_dataset_tensor), + TensorValue(&transformations)}); + std::unique_ptr assert_next_dataset_context; + TF_ASSERT_OK(CreateAssertNextDatasetContext( + assert_next_dataset_kernel.get(), &inputs, &assert_next_dataset_context)); + + DatasetBase* assert_next_dataset; + TF_ASSERT_OK(CreateDataset(assert_next_dataset_kernel.get(), + assert_next_dataset_context.get(), + &assert_next_dataset)); + core::ScopedUnref scoped_unref(assert_next_dataset); + + std::unique_ptr iterator_context; + TF_ASSERT_OK(CreateIteratorContext(assert_next_dataset_context.get(), + &iterator_context)); + std::unique_ptr iterator; + string iterator_prefix = name_utils::IteratorPrefix( + TakeDatasetOp::kDatasetType, + name_utils::IteratorPrefix(RangeDatasetOp::kDatasetType, "Iterator")); + TF_ASSERT_OK(assert_next_dataset->MakeIterator(iterator_context.get(), + iterator_prefix, &iterator)); + + std::unique_ptr serialization_ctx; + TF_ASSERT_OK(CreateSerializationContext(&serialization_ctx)); + bool end_of_sequence = false; + std::vector out_tensors; + int cur_iteration = 0; + const std::vector& breakpoints = test_case.breakpoints; + for (int breakpoint : breakpoints) { + VariantTensorData data; + VariantTensorDataWriter writer(&data); + TF_EXPECT_OK(iterator->Save(serialization_ctx.get(), &writer)); + TF_EXPECT_OK(writer.Flush()); + VariantTensorDataReader reader(&data); + TF_EXPECT_OK(RestoreIterator(iterator_context.get(), &reader, + iterator_prefix, *assert_next_dataset, + &iterator)); + + while (cur_iteration <= breakpoint) { + std::vector next; + TF_EXPECT_OK( + iterator->GetNext(iterator_context.get(), &next, &end_of_sequence)); + out_tensors.insert(out_tensors.end(), next.begin(), next.end()); + ++cur_iteration; + } + } + + TF_EXPECT_OK(ExpectEqual(out_tensors, test_case.expected_outputs, + /*compare_order*/ true)); +} + +INSTANTIATE_TEST_SUITE_P( + AssertNextDatasetOpTest, ParameterizedAssertNextDatasetOpTest, + ::testing::ValuesIn(std::vector({TestCase1(), TestCase2()}))); + +TEST_F(AssertNextDatasetOpTest, InvalidArguments) { + int thread_num = 2, cpu_num = 2; + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + + std::vector test_cases = {AssertNextInvalid(), AssertNextShort()}; + for (TestCase test_case : test_cases) { + Tensor range_and_take_dataset_tensor; + TF_ASSERT_OK(MakeRangeAndTakeDatasetTensor(test_case.range_dataset_params, + test_case.take_dataset_params, + &range_and_take_dataset_tensor)); + + std::unique_ptr assert_next_dataset_kernel; + TF_ASSERT_OK(CreateAssertNextDatasetOpKernel( + test_case.expected_output_dtypes, test_case.expected_output_shapes, + &assert_next_dataset_kernel)); + Tensor transformations = test_case.transformations; + gtl::InlinedVector inputs( + {TensorValue(&range_and_take_dataset_tensor), + TensorValue(&transformations)}); + std::unique_ptr assert_next_dataset_context; + TF_ASSERT_OK( + CreateAssertNextDatasetContext(assert_next_dataset_kernel.get(), + &inputs, &assert_next_dataset_context)); + + DatasetBase* assert_next_dataset; + TF_ASSERT_OK(CreateDataset(assert_next_dataset_kernel.get(), + assert_next_dataset_context.get(), + &assert_next_dataset)); + core::ScopedUnref scoped_unref(assert_next_dataset); + + std::unique_ptr iterator_context; + TF_ASSERT_OK(CreateIteratorContext(assert_next_dataset_context.get(), + &iterator_context)); + std::unique_ptr iterator; + string iterator_prefix = name_utils::IteratorPrefix( + TakeDatasetOp::kDatasetType, + 
name_utils::IteratorPrefix(RangeDatasetOp::kDatasetType, "Iterator")); + EXPECT_EQ( + assert_next_dataset + ->MakeIterator(iterator_context.get(), iterator_prefix, &iterator) + .code(), + tensorflow::error::INVALID_ARGUMENT); + } +} + +} // namespace +} // namespace data +} // namespace tensorflow From 2f72324661d4a2f8f66586f15f17ad7ffb82ff95 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Fri, 19 Jul 2019 20:36:23 +0000 Subject: [PATCH 0184/3053] Cast start, limit and delta directly, if dtype is known, based on review feedback Signed-off-by: Yong Tang --- tensorflow/python/ops/math_ops.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 114df461a8b..2b6267fc635 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -1349,28 +1349,20 @@ def range(start, limit=None, delta=1, dtype=None, name="range"): # pylint: disa start, limit = 0, start with ops.name_scope(name, "Range", [start, limit, delta]) as name: - # In case start, limit, or delta is already a tensor and have different - # dtype with the specified dtype, try to do a cast to see if the dtype is - # compatible. Otherwise pass to convert_to_tensor. This is to handle + # In case dtype is not none, cast start, limit, and delta directly. + # Otherwise pass to convert_to_tensor. This is to handle # the situation with: # tf.range(tf.constant(5), dtype=tf.float32) # which is comparable with: # np.arange(np.int(5), dtype=np.float32) - if (isinstance(start, ops.Tensor) and - dtype is not None and dtype != start.dtype): + if dtype is not None: start = cast(start, dtype=dtype) - else: - start = ops.convert_to_tensor(start, dtype=dtype, name="start") - if (isinstance(limit, ops.Tensor) and - dtype is not None and dtype != limit.dtype): limit = cast(limit, dtype=dtype) - else: - limit = ops.convert_to_tensor(limit, dtype=dtype, name="limit") - if (isinstance(delta, ops.Tensor) and - dtype is not None and dtype != delta.dtype): delta = cast(delta, dtype=dtype) else: - delta = ops.convert_to_tensor(delta, dtype=dtype, name="delta") + start = ops.convert_to_tensor(start, name="start") + limit = ops.convert_to_tensor(limit, name="limit") + delta = ops.convert_to_tensor(delta, name="delta") # infer dtype if not explicitly provided if dtype is None: From 24297a4cb9120351643f7ac3916e7398236ccc0d Mon Sep 17 00:00:00 2001 From: Kaixi Hou Date: Fri, 19 Jul 2019 13:41:25 -0700 Subject: [PATCH 0185/3053] use padded IO for cudnn rnn only when necessary --- tensorflow/core/kernels/cudnn_rnn_ops.cc | 42 +++++++++++++++---- tensorflow/stream_executor/cuda/cuda_dnn.cc | 13 +++--- tensorflow/stream_executor/cuda/cuda_dnn.h | 3 +- tensorflow/stream_executor/dnn.h | 4 +- .../stream_executor/stream_executor_pimpl.cc | 5 ++- .../stream_executor/stream_executor_pimpl.h | 3 +- 6 files changed, 52 insertions(+), 18 deletions(-) diff --git a/tensorflow/core/kernels/cudnn_rnn_ops.cc b/tensorflow/core/kernels/cudnn_rnn_ops.cc index 09826f57ce5..1daadd2f9f1 100644 --- a/tensorflow/core/kernels/cudnn_rnn_ops.cc +++ b/tensorflow/core/kernels/cudnn_rnn_ops.cc @@ -1027,7 +1027,7 @@ class CudnnRNNKernelCommon : public OpKernel { num_layers, h_num_units, input_size, /*cell_size=*/c_num_units, /*batch_size=*/0, input_mode, rnn_direction_mode(), rnn_mode(), ToDataType::value, algo_config, dropout(), seed(), - /* state_allocator=*/nullptr); + /* state_allocator=*/nullptr, /*use_padded_io=*/false); if (!rnn_desc_s.ok()) { return 
FromExecutorStatus(rnn_desc_s); } @@ -1041,14 +1041,16 @@ class CudnnRNNKernelCommon : public OpKernel { const RnnInputMode& input_mode, const AlgorithmConfig& algo_config, ScratchAllocator* dropout_state_allocator, - std::unique_ptr* rnn_desc) { + std::unique_ptr* rnn_desc, + bool use_padded_io) { StreamExecutor* executor = context->op_device_context()->stream()->parent(); se::dnn::DataType data_type = ToDataType::value; auto rnn_desc_s = executor->createRnnDescriptor( model_shapes.num_layers, model_shapes.num_units, model_shapes.input_size, model_shapes.cell_num_units, model_shapes.batch_size, input_mode, rnn_direction_mode(), rnn_mode(), - data_type, algo_config, dropout(), seed(), dropout_state_allocator); + data_type, algo_config, dropout(), seed(), dropout_state_allocator, + use_padded_io); TF_RETURN_IF_ERROR(rnn_desc_s.status()); *rnn_desc = rnn_desc_s.ConsumeValueOrDie(); @@ -1066,7 +1068,8 @@ class CudnnRNNKernelCommon : public OpKernel { const RnnInputMode& input_mode, const AlgorithmConfig& algo_config, RnnStateCache* cache, - RnnDescriptor** rnn_desc) { + RnnDescriptor** rnn_desc, + bool use_padded_io) { auto key = std::make_pair(model_shapes, algo_config.algorithm()); RnnScratchSpace& rnn_state = (*cache)[key]; if (rnn_state.rnn_desc == nullptr || ResetRndGenState()) { @@ -1075,7 +1078,8 @@ class CudnnRNNKernelCommon : public OpKernel { rnn_state.dropout_state_allocator.reset(dropout_state_allocator); Status status = CreateRnnDescriptor(context, model_shapes, input_mode, algo_config, - dropout_state_allocator, &rnn_state.rnn_desc); + dropout_state_allocator, &rnn_state.rnn_desc, + use_padded_io); TF_RETURN_IF_ERROR(status); } *rnn_desc = rnn_state.rnn_desc.get(); @@ -1444,11 +1448,21 @@ class CudnnRNNForwardOp : public CudnnRNNKernelCommon { const Tensor* params = nullptr; const Tensor* sequence_lengths = nullptr; CudnnRnnModelShapes model_shapes; + bool use_padded_io = false; if (var_seq_lengths) { OP_REQUIRES_OK(context, ExtractForwardInput( context, model_types(), time_major, &input, &input_h, &input_c, ¶ms, &sequence_lengths, num_proj, &model_shapes)); + auto seq_array = sequence_lengths->template flat().data(); + bool all_max_seq_length = true; + for (int i = 0; i < model_shapes.batch_size; i++) { + if (seq_array[i] != model_shapes.max_seq_length) { + all_max_seq_length = false; + break; + } + } + use_padded_io = !(time_major && all_max_seq_length); } else { OP_REQUIRES_OK(context, ExtractForwardInput(context, model_types(), time_major, @@ -1491,7 +1505,8 @@ class CudnnRNNForwardOp : public CudnnRNNKernelCommon { OP_REQUIRES_OK( context, GetCachedRnnDescriptor(context, model_shapes, input_mode, *output_algo_config, - &rnn_state_cache_, &rnn_desc_ptr)); + &rnn_state_cache_, &rnn_desc_ptr, + use_padded_io)); launch_status = DoForward( context, *rnn_desc_ptr, model_types(), model_shapes, input, input_h, input_c, params, is_training_, output, output_h, output_c, @@ -1690,7 +1705,8 @@ class CudnnRNNForwardOpV2 CudnnRnnAllocatorInTemp dropout_state_allocator(context); if (!this->template CreateRnnDescriptor( context, model_shapes, input_mode, AlgorithmConfig(algo), - &dropout_state_allocator, &rnn_desc) + &dropout_state_allocator, &rnn_desc, + /*use_padded_io=*/false) .ok()) { continue; } @@ -1840,11 +1856,21 @@ class CudnnRNNBackwardOp : public CudnnRNNKernelCommon { const Tensor* params = nullptr; const Tensor* sequence_lengths = nullptr; CudnnRnnModelShapes model_shapes; + bool use_padded_io = false; if (var_seq_lengths) { OP_REQUIRES_OK(context, ExtractForwardInput( 
context, model_types(), time_major, &input, &input_h, &input_c, ¶ms, &sequence_lengths, num_proj, &model_shapes)); + auto seq_array = sequence_lengths->template flat().data(); + bool all_max_seq_length = true; + for (int i = 0; i < model_shapes.batch_size; i++) { + if (seq_array[i] != model_shapes.max_seq_length) { + all_max_seq_length = false; + break; + } + } + use_padded_io = !(time_major && all_max_seq_length); } else { OP_REQUIRES_OK(context, ExtractForwardInput(context, model_types(), time_major, @@ -1890,7 +1916,7 @@ class CudnnRNNBackwardOp : public CudnnRNNKernelCommon { OP_REQUIRES_OK( context, GetCachedRnnDescriptor(context, model_shapes, input_mode, algo_config, &rnn_state_cache_, - &rnn_desc_ptr)); + &rnn_desc_ptr, use_padded_io)); launch_status = DoBackward( context, *rnn_desc_ptr, model_types(), model_shapes, input, input_h, input_c, params, output, output_h, output_c, output_backprop, diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc index 4e900b41881..ed112e4aa4a 100755 --- a/tensorflow/stream_executor/cuda/cuda_dnn.cc +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -1043,7 +1043,7 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor { cudnnDirectionMode_t direction_mode, cudnnRNNMode_t rnn_mode, cudnnDataType_t data_type, cudnnDataType_t compute_type, const dnn::AlgorithmConfig& algorithm_config, float dropout, uint64 seed, - ScratchAllocator* state_allocator) { + ScratchAllocator* state_allocator, bool use_padded_io) { SE_ASSIGN_OR_RETURN( CudnnDropoutDescriptor dropout_desc, CudnnDropoutDescriptor::Create(cudnn, dropout, seed, state_allocator)); @@ -1079,8 +1079,10 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor { // But in the future if these APIs are used to process full length arrays, // we need to distinguish when to set it. #if CUDNN_VERSION >= 7201 - RETURN_IF_CUDNN_ERROR( - cudnnSetRNNPaddingMode(rnn_desc.get(), CUDNN_RNN_PADDED_IO_ENABLED)); + if (use_padded_io) { + RETURN_IF_CUDNN_ERROR( + cudnnSetRNNPaddingMode(rnn_desc.get(), CUDNN_RNN_PADDED_IO_ENABLED)); + } #endif port::StatusOr rnn_plan_wrapper; @@ -1974,7 +1976,8 @@ CudnnSupport::createRnnDescriptor( int batch_size, dnn::RnnInputMode input_mode, dnn::RnnDirectionMode direction_mode, dnn::RnnMode rnn_mode, dnn::DataType data_type, const dnn::AlgorithmConfig& algorithm_config, - float dropout, uint64 seed, ScratchAllocator* state_allocator) { + float dropout, uint64 seed, ScratchAllocator* state_allocator, + bool use_padded_io) { // Setting up a cudnnRNNDescriptor requires a cuDNN handle, but because it's // not enqueueing anything into a stream, we pass in the null stream. 
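  // (Added commentary, not from the patch.) The new use_padded_io argument
  // threaded through to this point controls whether CudnnRnnDescriptor::Create
  // enables CUDNN_RNN_PADDED_IO_ENABLED (see the change above). The CudnnRNN
  // kernels derive it from the variable-sequence-length inputs roughly as:
  //   all_max_seq_length = (seq_array[i] == max_seq_length for every batch entry)
  //   use_padded_io      = !(time_major && all_max_seq_length)
  // so padded IO is requested only when some sequence is shorter than the
  // padded length or the batch is not time-major; fully dense time-major
  // batches keep the faster non-padded cuDNN path.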
auto cudnn = cudnn_->GetHandle(parent_, /*stream=*/nullptr); @@ -1985,7 +1988,7 @@ CudnnSupport::createRnnDescriptor( ToCudnnRnnInputMode(input_mode), ToCudnnRnnDirectionMode(direction_mode), ToCudnnRnnMode(rnn_mode), ToCudnnDataType(data_type), GetRnnComputeType(data_type), - algorithm_config, dropout, seed, state_allocator)); + algorithm_config, dropout, seed, state_allocator, use_padded_io)); return std::unique_ptr( new CudnnRnnDescriptor(std::move(rnn_desc))); } diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h index e3742c07a56..482e86135d9 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.h +++ b/tensorflow/stream_executor/cuda/cuda_dnn.h @@ -51,7 +51,8 @@ class CudnnSupport : public dnn::DnnSupport { int batch_size, dnn::RnnInputMode input_mode, dnn::RnnDirectionMode direction_mode, dnn::RnnMode rnn_mode, dnn::DataType data_type, const dnn::AlgorithmConfig& algorithm_config, - float dropout, uint64 seed, ScratchAllocator* state_allocator) override; + float dropout, uint64 seed, ScratchAllocator* state_allocator, + bool use_padded_io) override; port::StatusOr> createRnnSequenceTensorDescriptor(int max_seq_length, int batch_size, diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h index 7837c8e3b69..a8358379135 100644 --- a/tensorflow/stream_executor/dnn.h +++ b/tensorflow/stream_executor/dnn.h @@ -2095,6 +2095,7 @@ class DnnSupport { // state_allocator: an memory allocator that will be used to store the state // for dropout layer. The user has to maintain the memory until the model // is no longer in use. + // use_padded_io: a bool to specify whether the input is using padded IO. virtual port::StatusOr> createRnnDescriptor(int num_layers, int hidden_size, int input_size, int cell_size, int batch_size, @@ -2103,7 +2104,8 @@ class DnnSupport { dnn::RnnMode rnn_mode, dnn::DataType data_type, const dnn::AlgorithmConfig& algorithm_config, float dropout, uint64 seed, - ScratchAllocator* state_allocator) { + ScratchAllocator* state_allocator, + bool use_padded_io) { return port::Status(port::error::UNIMPLEMENTED, "createRnnDescriptor is unimplemented"); } diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc index 839f1cd20be..85da0593cd2 100644 --- a/tensorflow/stream_executor/stream_executor_pimpl.cc +++ b/tensorflow/stream_executor/stream_executor_pimpl.cc @@ -340,7 +340,8 @@ StreamExecutor::createRnnDescriptor( int batch_size, dnn::RnnInputMode input_mode, dnn::RnnDirectionMode direction_mode, dnn::RnnMode rnn_mode, dnn::DataType data_type, const dnn::AlgorithmConfig &algorithm_config, - float dropout, uint64 seed, ScratchAllocator *state_allocator) { + float dropout, uint64 seed, ScratchAllocator *state_allocator, + bool use_padded_io) { dnn::DnnSupport *dnn_support = AsDnn(); if (!dnn_support) { return port::Status(port::error::UNKNOWN, @@ -349,7 +350,7 @@ StreamExecutor::createRnnDescriptor( return dnn_support->createRnnDescriptor( num_layers, hidden_size, input_size, cell_size, batch_size, input_mode, direction_mode, rnn_mode, data_type, algorithm_config, dropout, seed, - state_allocator); + state_allocator, use_padded_io); } port::StatusOr> diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h index d2f2f591e2a..962bea4d0bc 100644 --- a/tensorflow/stream_executor/stream_executor_pimpl.h +++ b/tensorflow/stream_executor/stream_executor_pimpl.h @@ -398,7 +398,8 @@ class 
StreamExecutor { int batch_size, dnn::RnnInputMode input_mode, dnn::RnnDirectionMode direction_mode, dnn::RnnMode rnn_mode, dnn::DataType data_type, const dnn::AlgorithmConfig &algorithm_config, - float dropout, uint64 seed, ScratchAllocator *state_allocator); + float dropout, uint64 seed, ScratchAllocator *state_allocator, + bool use_padded_io); // Create a RNN sequence descriptor that specifies either the input or output // sequence. The caller retains the ownership of the returned descriptor. From 95d872a7cb49795574f18a591826361cbf26464a Mon Sep 17 00:00:00 2001 From: Nupur Garg Date: Fri, 19 Jul 2019 13:45:33 -0700 Subject: [PATCH 0186/3053] Add support for freezing the Switch op when it is used with resource variables. PiperOrigin-RevId: 259030554 --- .../python/framework/graph_util_impl.py | 36 +- .../python/framework/graph_util_test.py | 378 ++++++++++-------- 2 files changed, 225 insertions(+), 189 deletions(-) diff --git a/tensorflow/python/framework/graph_util_impl.py b/tensorflow/python/framework/graph_util_impl.py index 59621a0bc2a..5c131abbcb1 100644 --- a/tensorflow/python/framework/graph_util_impl.py +++ b/tensorflow/python/framework/graph_util_impl.py @@ -126,6 +126,12 @@ def _extract_graph_summary(graph_def): n = _node_name(node.name) name_to_node[n] = node name_to_input_name[n] = [_node_name(x) for x in node.input] + # Prevent colocated nodes from being lost. + if "_class" in node.attr: + for colocated_node_name in node.attr["_class"].list.s: + colocated_node_decoded = colocated_node_name.decode("utf-8") + if colocated_node_decoded.startswith("loc:@"): + name_to_input_name[n].append(colocated_node_decoded[5:]) name_to_seq_num[n] = seq seq += 1 return name_to_input_name, name_to_node, name_to_seq_num @@ -243,15 +249,7 @@ def convert_variables_to_constants(sess, GraphDef containing a simplified version of the original. """ - def get_input_name(node): - """Gets the name of the first input. Errors if suffix is not :0.""" - details = node.input[0].split(":") - if len(details) == 1 or int(details[1]) == 0: - return details[0] - # While it is valid for input tensors to have a suffix that is not :0, this - # method is used to find the associated ops, not tensors, and therefore it - # is not valid. - raise ValueError("Tensor name '{0}' is invalid.".format(node.input[0])) + get_input_name = lambda node: node.input[0].split(":")[0] def create_const_op(node_name, dtype, data, data_shape=None): """Creates a Const op.""" @@ -277,7 +275,7 @@ def convert_variables_to_constants(sess, # Get list of variables. variable_names = [] variable_dict_names = [] - resource_identity_types = {} + resource_op_types = {} for node in inference_graph.node: if node.op in ["Variable", "VariableV2", "VarHandleOp"]: variable_name = node.name @@ -292,11 +290,12 @@ def convert_variables_to_constants(sess, else: variable_names.append(variable_name + ":0") elif node.op in ["ReadVariableOp", "ResourceGather"]: - # There can be one or more Identity ops in between the ReadVariableOp and - # VarHandleOp. Store the Identity ops with the associated dtypes. + # There can be one or more Identity or Switch ops in between the + # ReadVariableOp and VarHandleOp. Store the ops with the associated + # dtypes. 
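      # (Added commentary; the conditional-branch example below is an assumption
      # about where Switch nodes typically come from, not something stated in
      # the patch.) A graph being frozen can now contain a chain such as
      #   VarHandleOp -> Switch -> Identity -> ReadVariableOp
      # e.g. when a resource variable is read inside a tf.cond branch. The loop
      # below follows input[0] upward through any Identity/Switch nodes and
      # records the ReadVariableOp's dtype for each of them, so that once the
      # VarHandleOp has been folded into a Const, those intermediate nodes can
      # have their "T" attr rewritten from DT_RESOURCE to the variable's actual
      # dtype (see the resource_op_types handling further down).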
source_op_name = get_input_name(node) - while map_name_to_node[source_op_name].op == "Identity": - resource_identity_types[source_op_name] = node.attr["dtype"] + while map_name_to_node[source_op_name].op in ["Identity", "Switch"]: + resource_op_types[source_op_name] = node.attr["dtype"] source_op_name = get_input_name(map_name_to_node[source_op_name]) if map_name_to_node[source_op_name].op != "VarHandleOp": raise ValueError("Cannot find the variable that is an input " @@ -320,11 +319,12 @@ def convert_variables_to_constants(sess, output_node = create_const_op(input_node.name, input_node.attr["dtype"], data, data.shape) how_many_converted += 1 - elif input_node.name in resource_identity_types: - # Converts the Identities of type RESOURCE_DT to the appropriate type - # based on the input they are referencing. + elif input_node.name in resource_op_types: + # Converts the type of the ops between the ReadVariableOp and VarHandleOp + # from RESOURCE_DT to the appropriate type based on the input they are + # referencing. output_node.CopyFrom(input_node) - output_node.attr["T"].CopyFrom(resource_identity_types[input_node.name]) + output_node.attr["T"].CopyFrom(resource_op_types[input_node.name]) elif input_node.op == "ReadVariableOp": # The first branch converts all VarHandleOps of ResourceVariables to # constants, so we need to convert the associated ReadVariableOps to diff --git a/tensorflow/python/framework/graph_util_test.py b/tensorflow/python/framework/graph_util_test.py index 6a5a779ca03..d7626e90764 100644 --- a/tensorflow/python/framework/graph_util_test.py +++ b/tensorflow/python/framework/graph_util_test.py @@ -36,6 +36,8 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util from tensorflow.python.framework import test_util from tensorflow.python.grappler import tf_optimizer +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import gen_math_ops from tensorflow.python.ops import gen_state_ops from tensorflow.python.ops import math_ops # pylint: disable=unused-import from tensorflow.python.ops import math_ops as math_ops_lib @@ -205,54 +207,119 @@ class DeviceFunctionsTest(test.TestCase): with self.assertRaisesRegexp(TypeError, "must be a list"): graph_util.extract_sub_graph(graph_def, "n1") - def _test_convert_variables_with_functions(self, inline_functions): - """Freezes a graph with functions.""" + def create_node_def(self, op, name, inputs): + new_node = node_def_pb2.NodeDef() + new_node.op = op + new_node.name = name + new_node.input.extend(inputs) + return new_node - @function.Defun(dtypes.float32) - def plus_one(x): - return x + 1.0 + def create_constant_node_def(self, + name, + value, + dtype, + shape=None, + inputs=None): + node = self.create_node_def("Const", name, inputs or []) + self.set_attr_dtype(node, "dtype", dtype) + self.set_attr_tensor(node, "value", value, dtype, shape) + return node - with ops.Graph().as_default(): - variable_node = variables.Variable(1.0, name="variable_node") - _ = variables.Variable(1.0, name="unused_variable_node") - defun_node = plus_one(variable_node) - _ = math_ops_lib.multiply(defun_node, 2.0, name="output_node") + def set_attr_dtype(self, node, key, value): + node.attr[key].CopyFrom( + attr_value_pb2.AttrValue(type=value.as_datatype_enum)) - with session.Session() as sess: - self.evaluate(variables.variables_initializer([variable_node])) - variable_graph_def = sess.graph.as_graph_def() + def set_attr_tensor(self, node, key, value, dtype, shape=None): + 
node.attr[key].CopyFrom( + attr_value_pb2.AttrValue( + tensor=tensor_util.make_tensor_proto( + value, dtype=dtype, shape=shape))) - if inline_functions: - # Run Grappler to create the VarOpHandle --> Placeholder --> - # ResourceVariable pattern. - meta_graph = export_meta_graph(graph_def=variable_graph_def) - fetch_collection = meta_graph_pb2.CollectionDef() - for name in ["variable_node", "output_node"]: - fetch_collection.node_list.value.append(name) - meta_graph.collection_def["train_op"].CopyFrom(fetch_collection) + def testRemoveTrainingNodes(self): + a_constant_name = "a_constant" + b_constant_name = "b_constant" + a_check_name = "a_check" + b_check_name = "b_check" + a_identity_name = "a_identity" + b_identity_name = "b_identity" + add_name = "add" + graph_def = graph_pb2.GraphDef() + a_constant = self.create_constant_node_def( + a_constant_name, value=1, dtype=dtypes.float32, shape=[]) + graph_def.node.extend([a_constant]) + a_check_node = self.create_node_def("CheckNumerics", a_check_name, + [a_constant_name]) + graph_def.node.extend([a_check_node]) + a_identity_node = self.create_node_def( + "Identity", a_identity_name, [a_constant_name, "^" + a_check_name]) + graph_def.node.extend([a_identity_node]) + b_constant = self.create_constant_node_def( + b_constant_name, value=1, dtype=dtypes.float32, shape=[]) + graph_def.node.extend([b_constant]) + b_check_node = self.create_node_def("CheckNumerics", b_check_name, + [b_constant_name]) + graph_def.node.extend([b_check_node]) + b_identity_node = self.create_node_def( + "Identity", b_identity_name, [b_constant_name, "^" + b_check_name]) + graph_def.node.extend([b_identity_node]) + add_node = self.create_node_def("Add", add_name, + [a_identity_name, b_identity_name]) + self.set_attr_dtype(add_node, "T", dtypes.float32) + graph_def.node.extend([add_node]) - # Initialize RewriterConfig with everything disabled except function - # inlining. - config = config_pb2.ConfigProto() - rewrite_options = config.graph_options.rewrite_options - rewrite_options.optimizers.append("function") - variable_graph_def = tf_optimizer.OptimizeGraph(config, meta_graph) + expected_output = graph_pb2.GraphDef() + a_constant = self.create_constant_node_def( + a_constant_name, value=1, dtype=dtypes.float32, shape=[]) + expected_output.node.extend([a_constant]) + b_constant = self.create_constant_node_def( + b_constant_name, value=1, dtype=dtypes.float32, shape=[]) + expected_output.node.extend([b_constant]) + add_node = self.create_node_def("Add", add_name, + [a_constant_name, b_constant_name]) + self.set_attr_dtype(add_node, "T", dtypes.float32) + expected_output.node.extend([add_node]) - constant_graph_def = graph_util.convert_variables_to_constants( - sess, variable_graph_def, ["output_node"]) + output = graph_util.remove_training_nodes(graph_def) + self.assertProtoEquals(expected_output, output) - # Ensure there are no variables after freezing. - for node in constant_graph_def.node: - self.assertNotIn( - node.op, ["Variable", "VariableV2", "VarHandleOp", "ReadVariableOp"]) + def testRemoveIdentityChains(self): + """Check that chains of Identity nodes are correctly pruned. - def testConvertVariablesToConstsWithFunctions(self): - """Freezes a graph with functions.""" - self._test_convert_variables_with_functions(inline_functions=False) + Create a chain of four nodes, A, B, C, and D where A inputs B, B inputs C, + and C inputs D. Nodes B and C are "Identity" and should be pruned, resulting + in the nodes A and D, where A inputs D. 
+ """ + graph_def = graph_pb2.GraphDef() + graph_def.node.extend([ + self.create_node_def("Aop", "A", ["B"]), + self.create_node_def("Identity", "B", ["C"]), + self.create_node_def("Identity", "C", ["D"]), + self.create_node_def("Dop", "D", []) + ]) - def testConvertVariableToConstsWithFunctionsInlined(self): - """Freezes a graph with functions that have been inlined using Grappler.""" - self._test_convert_variables_with_functions(inline_functions=True) + expected_graph_def = graph_pb2.GraphDef() + expected_graph_def.node.extend([ + self.create_node_def("Aop", "A", ["D"]), + self.create_node_def("Dop", "D", []) + ]) + + self.assertProtoEquals(expected_graph_def, + graph_util.remove_training_nodes(graph_def)) + + def testRemoveIdentityUsedAsControlInputInConst(self): + """Check that Identity nodes used as control inputs are not removed.""" + graph_def = graph_pb2.GraphDef() + graph_def.node.extend([ + self.create_constant_node_def("C", 1, dtypes.float32, inputs=["^I"]), + self.create_node_def("Identity", "I", ["Base"]), + self.create_node_def("BaseOp", "Base", []) + ]) + + self.assertProtoEquals(graph_def, + graph_util.remove_training_nodes(graph_def)) + + +class ConvertVariablesToConstantsTest(test.TestCase): def _get_tensors(self, sess, tensor_list): """Returns a list of Tensor objects from the Session.""" @@ -271,45 +338,6 @@ class DeviceFunctionsTest(test.TestCase): return sess.run( output_tensors, feed_dict=dict(zip(input_tensors, input_data))) - @test_util.run_v1_only("Incompatible with TF 2.0") - def testConvertVariablesToConstsWithEmbeddings(self): - """Freezes a graph with embeddings.""" - input_data = np.array(np.random.random_sample([1, 1]), dtype=np.int32) - - # Make model. - state_input = keras.layers.Input( - shape=(1,), name="state_input", dtype="int32") - output = keras.layers.Embedding( - output_dim=16, input_dim=100, input_length=1, name="state")( - state_input) - model = keras.models.Model(inputs=[state_input], outputs=[output]) - model.compile( - loss={"state": "sparse_categorical_crossentropy"}, optimizer="adam") - - # Get associated session. - sess = keras.backend.get_session() - variable_graph_def = sess.graph_def - output_tensor = [tensor.name.split(":")[0] for tensor in model.outputs] - constant_graph_def = graph_util.convert_variables_to_constants( - sess, variable_graph_def, output_tensor) - - # Ensure graph has no variables. - for node in constant_graph_def.node: - self.assertNotIn( - node.op, ["Variable", "VariableV2", "VarHandleOp", "ReadVariableOp"]) - - # Compare the value of the graphs. 
- expected_value = model.predict(input_data) - actual_value = self._evaluate_graph_def(constant_graph_def, model.inputs, - model.outputs, [input_data]) - np.testing.assert_almost_equal(np.array([expected_value]), actual_value, 5) - - def testConvertVariablesToConsts(self): - self._test_variable_to_const_conversion(use_resource=False) - - def testConvertResourceVariablesToConsts(self): - self._test_variable_to_const_conversion(use_resource=True) - def _test_variable_to_const_conversion(self, use_resource): with ops.Graph().as_default(): with variable_scope.variable_scope("", use_resource=use_resource): @@ -376,111 +404,119 @@ class DeviceFunctionsTest(test.TestCase): output = self.evaluate(output_node) self.assertNear(2.0, output, 0.00001) - def create_node_def(self, op, name, inputs): - new_node = node_def_pb2.NodeDef() - new_node.op = op - new_node.name = name - for input_name in inputs: - new_node.input.extend([input_name]) - return new_node + def _test_convert_variables_with_functions(self, inline_functions): + """Freezes a graph with functions.""" - def create_constant_node_def(self, name, value, dtype, - shape=None, inputs=None): - node = self.create_node_def("Const", name, inputs or []) - self.set_attr_dtype(node, "dtype", dtype) - self.set_attr_tensor(node, "value", value, dtype, shape) - return node + @function.Defun(dtypes.float32) + def plus_one(x): + return x + 1.0 - def set_attr_dtype(self, node, key, value): - node.attr[key].CopyFrom( - attr_value_pb2.AttrValue(type=value.as_datatype_enum)) + with ops.Graph().as_default(): + variable_node = variables.Variable(1.0, name="variable_node") + _ = variables.Variable(1.0, name="unused_variable_node") + defun_node = plus_one(variable_node) + _ = math_ops_lib.multiply(defun_node, 2.0, name="output_node") - def set_attr_tensor(self, node, key, value, dtype, shape=None): - node.attr[key].CopyFrom( - attr_value_pb2.AttrValue(tensor=tensor_util.make_tensor_proto( - value, dtype=dtype, shape=shape))) + with session.Session() as sess: + self.evaluate(variables.variables_initializer([variable_node])) + variable_graph_def = sess.graph.as_graph_def() - def testRemoveTrainingNodes(self): - a_constant_name = "a_constant" - b_constant_name = "b_constant" - a_check_name = "a_check" - b_check_name = "b_check" - a_identity_name = "a_identity" - b_identity_name = "b_identity" - add_name = "add" - graph_def = graph_pb2.GraphDef() - a_constant = self.create_constant_node_def( - a_constant_name, value=1, dtype=dtypes.float32, shape=[]) - graph_def.node.extend([a_constant]) - a_check_node = self.create_node_def("CheckNumerics", a_check_name, - [a_constant_name]) - graph_def.node.extend([a_check_node]) - a_identity_node = self.create_node_def( - "Identity", a_identity_name, [a_constant_name, "^" + a_check_name]) - graph_def.node.extend([a_identity_node]) - b_constant = self.create_constant_node_def( - b_constant_name, value=1, dtype=dtypes.float32, shape=[]) - graph_def.node.extend([b_constant]) - b_check_node = self.create_node_def("CheckNumerics", b_check_name, - [b_constant_name]) - graph_def.node.extend([b_check_node]) - b_identity_node = self.create_node_def( - "Identity", b_identity_name, [b_constant_name, "^" + b_check_name]) - graph_def.node.extend([b_identity_node]) - add_node = self.create_node_def("Add", add_name, - [a_identity_name, b_identity_name]) - self.set_attr_dtype(add_node, "T", dtypes.float32) - graph_def.node.extend([add_node]) + if inline_functions: + # Run Grappler to create the VarOpHandle --> Placeholder --> + # ResourceVariable 
pattern. + meta_graph = export_meta_graph(graph_def=variable_graph_def) + fetch_collection = meta_graph_pb2.CollectionDef() + for name in ["variable_node", "output_node"]: + fetch_collection.node_list.value.append(name) + meta_graph.collection_def["train_op"].CopyFrom(fetch_collection) - expected_output = graph_pb2.GraphDef() - a_constant = self.create_constant_node_def( - a_constant_name, value=1, dtype=dtypes.float32, shape=[]) - expected_output.node.extend([a_constant]) - b_constant = self.create_constant_node_def( - b_constant_name, value=1, dtype=dtypes.float32, shape=[]) - expected_output.node.extend([b_constant]) - add_node = self.create_node_def("Add", add_name, - [a_constant_name, b_constant_name]) - self.set_attr_dtype(add_node, "T", dtypes.float32) - expected_output.node.extend([add_node]) + # Initialize RewriterConfig with everything disabled except function + # inlining. + config = config_pb2.ConfigProto() + rewrite_options = config.graph_options.rewrite_options + rewrite_options.optimizers.append("function") + variable_graph_def = tf_optimizer.OptimizeGraph(config, meta_graph) - output = graph_util.remove_training_nodes(graph_def) - self.assertProtoEquals(expected_output, output) + constant_graph_def = graph_util.convert_variables_to_constants( + sess, variable_graph_def, ["output_node"]) - def testRemoveIdentityChains(self): - """Check that chains of Identity nodes are correctly pruned. + # Ensure there are no variables after freezing. + for node in constant_graph_def.node: + self.assertNotIn( + node.op, ["Variable", "VariableV2", "VarHandleOp", "ReadVariableOp"]) - Create a chain of four nodes, A, B, C, and D where A inputs B, B inputs C, - and C inputs D. Nodes B and C are "Identity" and should be pruned, resulting - in the nodes A and D, where A inputs D. 
- """ - graph_def = graph_pb2.GraphDef() - graph_def.node.extend([ - self.create_node_def("Aop", "A", ["B"]), self.create_node_def( - "Identity", "B", ["C"]), self.create_node_def( - "Identity", "C", ["D"]), self.create_node_def("Dop", "D", []) - ]) + def testReferenceVariables(self): + """Freezes a graph with reference variables.""" + self._test_variable_to_const_conversion(use_resource=False) - expected_graph_def = graph_pb2.GraphDef() - expected_graph_def.node.extend([ - self.create_node_def("Aop", "A", ["D"]), self.create_node_def( - "Dop", "D", []) - ]) + def testResourceVariables(self): + """Freezes a graph with resource variables.""" + self._test_variable_to_const_conversion(use_resource=True) - self.assertProtoEquals(expected_graph_def, - graph_util.remove_training_nodes(graph_def)) + def testWithFunctions(self): + """Freezes a graph with functions.""" + self._test_convert_variables_with_functions(inline_functions=False) - def testRemoveIdentityUsedAsControlInputInConst(self): - """Check that Identity nodes used as control inputs are not removed.""" - graph_def = graph_pb2.GraphDef() - graph_def.node.extend([ - self.create_constant_node_def("C", 1, dtypes.float32, inputs=["^I"]), - self.create_node_def("Identity", "I", ["Base"]), - self.create_node_def("BaseOp", "Base", []) - ]) + def testWithInlinedFunctions(self): + """Freezes a graph with functions that have been inlined using Grappler.""" + self._test_convert_variables_with_functions(inline_functions=True) - self.assertProtoEquals(graph_def, - graph_util.remove_training_nodes(graph_def)) + @test_util.run_v1_only("Incompatible with TF 2.0") + def testWithEmbeddings(self): + """Freezes a graph with embeddings.""" + input_data = np.array(np.random.random_sample([1, 1]), dtype=np.int32) + + # Make model. + state_input = keras.layers.Input( + shape=(1,), name="state_input", dtype="int32") + output = keras.layers.Embedding( + output_dim=16, input_dim=100, input_length=1, name="state")( + state_input) + model = keras.models.Model(inputs=[state_input], outputs=[output]) + model.compile( + loss={"state": "sparse_categorical_crossentropy"}, optimizer="adam") + + # Get associated session. + sess = keras.backend.get_session() + variable_graph_def = sess.graph_def + output_tensor = [tensor.name.split(":")[0] for tensor in model.outputs] + constant_graph_def = graph_util.convert_variables_to_constants( + sess, variable_graph_def, output_tensor) + + # Ensure graph has no variables. + for node in constant_graph_def.node: + self.assertNotIn( + node.op, ["Variable", "VariableV2", "VarHandleOp", "ReadVariableOp"]) + + # Compare the value of the graphs. 
+ expected_value = model.predict(input_data) + actual_value = self._evaluate_graph_def(constant_graph_def, model.inputs, + model.outputs, [input_data]) + np.testing.assert_almost_equal(np.array([expected_value]), actual_value, 5) + + def testWithSwitch(self): + """Freezes a graph which contains a Switch with type RESOURCE_DT.""" + with ops.Graph().as_default(): + with variable_scope.variable_scope("", use_resource=True): + x = variable_scope.get_variable("var_x", initializer=1.0) + y = variable_scope.get_variable("var_y", initializer=2.0) + f1 = lambda: variable_scope.get_variable("var_f1", initializer=17.0) + f2 = lambda: variable_scope.get_variable("var_f2", initializer=23.0) + cond_node = control_flow_ops.case([(gen_math_ops.less(x, y), f1)], + default=f2) + _ = math_ops_lib.multiply(cond_node, 2.0, name="output_node") + + with session.Session() as sess: + sess.run(variables.global_variables_initializer()) + variable_graph_def = sess.graph.as_graph_def() + + constant_graph_def = graph_util.convert_variables_to_constants( + sess, variable_graph_def, ["output_node"]) + + # Ensure there are no variables after freezing. + for node in constant_graph_def.node: + self.assertNotIn( + node.op, ["Variable", "VariableV2", "VarHandleOp", "ReadVariableOp"]) if __name__ == "__main__": From 940509304b2f76134f76961813468b6b27d24d9e Mon Sep 17 00:00:00 2001 From: Karthik Muthuraman Date: Fri, 19 Jul 2019 13:54:46 -0700 Subject: [PATCH 0187/3053] added tests for reciprocal_no_nan(). --- tensorflow/python/ops/math_ops_test.py | 38 ++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py index 68740b67374..e7c7270c3af 100644 --- a/tensorflow/python/ops/math_ops_test.py +++ b/tensorflow/python/ops/math_ops_test.py @@ -699,5 +699,43 @@ class BinaryOpsTest(test_util.TensorFlowTestCase): a = array_ops.ones([1], dtype=dtypes.int32) + 1.0 self.evaluate(a) + +class ReciprocalNoNanTest(test_util.TensorFlowTestCase): + + allowed_dtypes = [dtypes.float16, dtypes.float32, dtypes.float64, + dtypes.complex64, dtypes.complex128] + + @test_util.run_in_graph_and_eager_modes + def testBasic(self): + for dtype in self.allowed_dtypes: + x = constant_op.constant([1.0, 2.0, 0.0, 4.0], dtype=dtype) + + y = math_ops.reciprocal_no_nan(x) + + target = constant_op.constant([1.0, 0.5, 0.0, 0.25], dtype=dtype) + + self.assertAllEqual(y, target) + self.assertEqual(y.dtype.base_dtype, target.dtype.base_dtype) + + def testInverse(self): + for dtype in self.allowed_dtypes: + x = np.random.choice([0, 1, 2, 4, 5], size=(5, 5, 5)) + x = constant_op.constant(x, dtype=dtype) + + y = math_ops.reciprocal_no_nan(math_ops.reciprocal_no_nan(x)) + + self.assertAllClose(y, x) + self.assertEqual(y.dtype.base_dtype, x.dtype.base_dtype) + + @test_util.run_in_graph_and_eager_modes + def testExceptionHandling(self): + for dtype in [dtypes.int8, dtypes.int16, dtypes.int32]: + x = constant_op.constant([1, 2, 0, 4], dtype=dtype) + try: + y = math_ops.reciprocal_no_nan(x) + except TypeError as te: + assert "incorrect data type" in str(te) + + if __name__ == "__main__": googletest.main() From 0d1930d025f5f03da8d90121541edada5706af3a Mon Sep 17 00:00:00 2001 From: Karthik Muthuraman Date: Fri, 19 Jul 2019 13:57:45 -0700 Subject: [PATCH 0188/3053] added test for reciprocal_no_nan(). 
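For context, a minimal sketch of the behavior these tests exercise, based on the
test values in the preceding patch (illustrative only, not part of the patch):
reciprocal_no_nan(x) computes 1/x elementwise but maps zero inputs to zero
instead of producing inf/nan.

    from tensorflow.python.framework import constant_op
    from tensorflow.python.ops import math_ops

    x = constant_op.constant([1.0, 2.0, 0.0, 4.0])
    # Expected result: [1.0, 0.5, 0.0, 0.25]; the zero entry stays 0.
    y = math_ops.reciprocal_no_nan(x)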
--- tensorflow/python/ops/math_ops_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py index e7c7270c3af..f174d55e8eb 100644 --- a/tensorflow/python/ops/math_ops_test.py +++ b/tensorflow/python/ops/math_ops_test.py @@ -717,6 +717,7 @@ class ReciprocalNoNanTest(test_util.TensorFlowTestCase): self.assertAllEqual(y, target) self.assertEqual(y.dtype.base_dtype, target.dtype.base_dtype) + @test_util.run_in_graph_and_eager_modes def testInverse(self): for dtype in self.allowed_dtypes: x = np.random.choice([0, 1, 2, 4, 5], size=(5, 5, 5)) From 7c224e67caad31cbda3b493e979ae766489c2360 Mon Sep 17 00:00:00 2001 From: Billy Lamberta Date: Fri, 19 Jul 2019 13:54:31 -0700 Subject: [PATCH 0189/3053] Fixing page titles. PiperOrigin-RevId: 259032363 --- tensorflow/lite/g3doc/performance/delegates.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/g3doc/performance/delegates.md b/tensorflow/lite/g3doc/performance/delegates.md index cb6494dcbcd..b1ccb9ef072 100644 --- a/tensorflow/lite/g3doc/performance/delegates.md +++ b/tensorflow/lite/g3doc/performance/delegates.md @@ -1,4 +1,4 @@ -## TensorFlow Lite delegates +# TensorFlow Lite delegates _Note: Delegate API is still experimental and is subject to change._ From f14756c25c813bcc16375d9998efb45c27198e7e Mon Sep 17 00:00:00 2001 From: Thomas O'Malley Date: Fri, 19 Jul 2019 14:04:14 -0700 Subject: [PATCH 0190/3053] Fix callback_tests in single code path. Enable `histogram_freq` for DistributionStrategy. PiperOrigin-RevId: 259034447 --- tensorflow/python/keras/callbacks_test.py | 6 ------ .../keras/distribute/distributed_training_utils.py | 13 +++---------- 2 files changed, 3 insertions(+), 16 deletions(-) diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py index 4cb70bbbaa7..f072384d09f 100644 --- a/tensorflow/python/keras/callbacks_test.py +++ b/tensorflow/python/keras/callbacks_test.py @@ -1345,8 +1345,6 @@ class TestTensorBoardV2(keras_parameterized.TestCase): See: """ - if testing_utils.should_run_distributed(): - self.skipTest('b/137397816') model = self._get_model() x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) tb_cbk = keras.callbacks.TensorBoard(self.logdir) @@ -1410,8 +1408,6 @@ class TestTensorBoardV2(keras_parameterized.TestCase): ) def test_TensorBoard_weight_histograms(self): - if testing_utils.should_run_distributed(): - self.skipTest('b/137397816') model = self._get_model() x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) tb_cbk = keras.callbacks.TensorBoard(self.logdir, histogram_freq=1) @@ -1442,8 +1438,6 @@ class TestTensorBoardV2(keras_parameterized.TestCase): ) def test_TensorBoard_weight_images(self): - if testing_utils.should_run_distributed(): - self.skipTest('b/137397816') model = self._get_model() x, y = np.ones((10, 10, 10, 1)), np.ones((10, 1)) tb_cbk = keras.callbacks.TensorBoard( diff --git a/tensorflow/python/keras/distribute/distributed_training_utils.py b/tensorflow/python/keras/distribute/distributed_training_utils.py index df47b5f8ea5..1f484ae7551 100644 --- a/tensorflow/python/keras/distribute/distributed_training_utils.py +++ b/tensorflow/python/keras/distribute/distributed_training_utils.py @@ -211,8 +211,8 @@ def validate_callbacks(input_callbacks, optimizer): Raises: ValueError: If `LearningRateScheduler` or `ReduceLROnPlateau` is one of the callbacks passed. 
- ValueError: If `histogram_freq` or `write_grads` is one of the parameters - passed as part of the TensorBoard callback. + ValueError: If `write_grads` is one of the parameters passed as part of the + TensorBoard callback. """ if input_callbacks: for callback in input_callbacks: @@ -227,20 +227,13 @@ def validate_callbacks(input_callbacks, optimizer): # features of the callback that involve accessing model attributes and # running ops. if isinstance(callback, callbacks.TensorBoard): - if getattr(callback, 'histogram_freq', False): - logging.warning( - UserWarning( - '`histogram_freq` in the TensorBoard callback is not ' - 'supported when using DistributionStrategy. Setting ' - '`histogram_freq` to `0`.')) - callback.histogram_freq = 0 if getattr(callback, 'write_grads', False): logging.warning( UserWarning( '`write_grads` in the TensorBoard callback is not supported ' 'when using DistributionStrategy. Setting `write_grads` ' 'to `False`.')) - callback.histogram_freq = False + callback.write_grads = False def validate_distributed_dataset_inputs(distribution_strategy, x, y, From d7cb6d0a3febf7893f92a84ef53c82928faeafaf Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 19 Jul 2019 14:12:27 -0700 Subject: [PATCH 0191/3053] - Disable tuning on Apple - we don't want to use an in-order-tuned kernel on an Apple CPU. We shouldn't even with tuning, as Apple CPUs are out-of-order, but we don't want to risk the case of misdetection by the tuning nanobenchmark. - Whenever tuning is not enabled, have the tuning resolver just return without even the overhead of querying a timestamp. PiperOrigin-RevId: 259036253 --- tensorflow/lite/experimental/ruy/platform.h | 7 +++++++ tensorflow/lite/experimental/ruy/tune.cc | 13 ++++++------- tensorflow/lite/experimental/ruy/tune.h | 13 +++++++++++++ tensorflow/lite/experimental/ruy/tune_test.cc | 2 ++ 4 files changed, 28 insertions(+), 7 deletions(-) diff --git a/tensorflow/lite/experimental/ruy/platform.h b/tensorflow/lite/experimental/ruy/platform.h index 13eccf8acf6..29c0fc20784 100644 --- a/tensorflow/lite/experimental/ruy/platform.h +++ b/tensorflow/lite/experimental/ruy/platform.h @@ -49,4 +49,11 @@ limitations under the License. #define RUY_DONOTUSEDIRECTLY_NEON_64 \ (RUY_DONOTUSEDIRECTLY_NEON && RUY_DONOTUSEDIRECTLY_ARM_64) +// Detect APPLE +#ifdef __APPLE__ +#define RUY_DONOTUSEDIRECTLY_APPLE 1 +#else +#define RUY_DONOTUSEDIRECTLY_APPLE 0 +#endif + #endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_PLATFORM_H_ diff --git a/tensorflow/lite/experimental/ruy/tune.cc b/tensorflow/lite/experimental/ruy/tune.cc index d2ca263e706..58a956e03cc 100644 --- a/tensorflow/lite/experimental/ruy/tune.cc +++ b/tensorflow/lite/experimental/ruy/tune.cc @@ -18,13 +18,11 @@ limitations under the License. #include #include -#include "tensorflow/lite/experimental/ruy/opt_set.h" -#include "tensorflow/lite/experimental/ruy/platform.h" #include "tensorflow/lite/experimental/ruy/time.h" namespace ruy { -#if RUY_PLATFORM(NEON_64) +#ifdef RUY_IMPLEMENT_TUNING namespace { @@ -131,7 +129,7 @@ Tuning TuningResolver::ResolveNow() { return is_probably_inorder ? 
Tuning::kInOrder : Tuning::kOutOfOrder; } -#else // not RUY_PLATFORM(NEON_64) +#else // not defined RUY_IMPLEMENT_TUNING float TuningResolver::EvalRatio() { return 0; } float TuningResolver::ThresholdRatio() { return 0; } @@ -146,9 +144,7 @@ TuningResolver::TuningResolver() : expiry_duration_(DurationFromSeconds(kExpirySecs)) {} Tuning TuningResolver::Resolve() { -#if !RUY_OPT_ENABLED(RUY_OPT_TUNING) - return Tuning::kOutOfOrder; -#endif +#ifdef RUY_IMPLEMENT_TUNING if (unresolved_tuning_ != Tuning::kAuto) { return unresolved_tuning_; } @@ -160,6 +156,9 @@ Tuning TuningResolver::Resolve() { last_resolved_timepoint_ = new_timepoint; last_resolved_tuning_ = ResolveNow(); return last_resolved_tuning_; +#else + return Tuning::kOutOfOrder; +#endif } } // namespace ruy diff --git a/tensorflow/lite/experimental/ruy/tune.h b/tensorflow/lite/experimental/ruy/tune.h index c1b95842b87..a1d0eb9ae40 100644 --- a/tensorflow/lite/experimental/ruy/tune.h +++ b/tensorflow/lite/experimental/ruy/tune.h @@ -74,8 +74,21 @@ limitations under the License. #include +#include "tensorflow/lite/experimental/ruy/opt_set.h" +#include "tensorflow/lite/experimental/ruy/platform.h" #include "tensorflow/lite/experimental/ruy/time.h" +// Tuning only implemented on NEON_64 at the moment (see assembly code +// in the nano-benchmark) and not on Apple (some Apple CPUs produce incorrect +// results on in-order-tuned kernels combining ARM and NEON load instructions +// and NEON `ins` instructions). +// +// When tuning is not implemented, we simply always use Tuning::kOutOfOrder. +#if RUY_OPT_ENABLED(RUY_OPT_TUNING) && RUY_PLATFORM(NEON_64) && \ + !RUY_PLATFORM(APPLE) +#define RUY_IMPLEMENT_TUNING +#endif + namespace ruy { enum class Tuning { diff --git a/tensorflow/lite/experimental/ruy/tune_test.cc b/tensorflow/lite/experimental/ruy/tune_test.cc index 571c2189e81..051c34910b6 100644 --- a/tensorflow/lite/experimental/ruy/tune_test.cc +++ b/tensorflow/lite/experimental/ruy/tune_test.cc @@ -33,6 +33,7 @@ TEST(TuneTest, TuneTest) { tuning_resolver.SetTuning(Tuning::kAuto); +#ifdef RUY_IMPLEMENT_TUNING for (auto tuning : {Tuning::kOutOfOrder, Tuning::kInOrder}) { tuning_resolver.SetTuning(tuning); ASSERT_TRUE(tuning_resolver.Resolve() == tuning); @@ -40,6 +41,7 @@ TEST(TuneTest, TuneTest) { std::this_thread::sleep_for(std::chrono::seconds(1)); ASSERT_TRUE(tuning_resolver.Resolve() == tuning); } +#endif } } // namespace From 5556dd6890a055bbc1534d96640be43eb2d3399f Mon Sep 17 00:00:00 2001 From: Bhavani Subramanian Date: Fri, 19 Jul 2019 14:17:38 -0700 Subject: [PATCH 0192/3053] Changed CHECK to DCHECK --- tensorflow/core/util/mkl_util.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index 65aca5ab10d..e8b083e22a8 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -1060,7 +1060,7 @@ inline memory::dims TFShapeToMklDnnDims(const TensorShape& shape) { inline memory::dims TFShapeToMklDnnDimsInNCHW(const TensorShape& shape, TensorFormat format) { // Check validity of format. 
- CHECK_NE(TFDataFormatToMklDnnDataFormat(format), MKL_TENSOR_FORMAT_INVALID); + DCHECK_NE(TFDataFormatToMklDnnDataFormat(format), MKL_TENSOR_FORMAT_INVALID); int n = shape.dim_size(GetTensorDimIndex(format, 'N')); int c = shape.dim_size(GetTensorDimIndex(format, 'C')); @@ -1074,7 +1074,7 @@ inline memory::dims TFShapeToMklDnnDimsInNCHW(const TensorShape& shape, inline memory::dims TFShapeToMklDnnDimsInNCDHW(const TensorShape& shape, TensorFormat format) { // Validate format. - CHECK_NE(TFDataFormatToMklDnn3DDataFormat(format), MKL_TENSOR_FORMAT_INVALID); + DCHECK_NE(TFDataFormatToMklDnn3DDataFormat(format), MKL_TENSOR_FORMAT_INVALID); int n = shape.dim_size(GetTensorDimIndex<3>(format, 'N')); int c = shape.dim_size(GetTensorDimIndex<3>(format, 'C')); @@ -1091,7 +1091,7 @@ inline memory::dims TFShapeToMklDnnDimsInNCDHW(const TensorShape& shape, inline memory::dims MklDnnDimsInNCHW(const memory::dims& in_dims, TensorFormat format) { // Validate format. - CHECK_NE(TFDataFormatToMklDnnDataFormat(format), MKL_TENSOR_FORMAT_INVALID); + DCHECK_NE(TFDataFormatToMklDnnDataFormat(format), MKL_TENSOR_FORMAT_INVALID); int n = in_dims[GetTensorDimIndex(format, 'N')]; int c = in_dims[GetTensorDimIndex(format, 'C')]; From c93807a7a4a8c2c8207efcf5de36bee28f8407e5 Mon Sep 17 00:00:00 2001 From: Trent Lo Date: Fri, 19 Jul 2019 14:33:42 -0700 Subject: [PATCH 0193/3053] Fixing a compilation error. Some compilers disallow passing const_iterator to std::vector::erase() (while some allow). --- tensorflow/core/common_runtime/bfc_allocator.cc | 9 +++++++-- tensorflow/core/common_runtime/bfc_allocator.h | 4 ++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc index 0d4dbb3cee4..3220851c8cb 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.cc +++ b/tensorflow/core/common_runtime/bfc_allocator.cc @@ -319,8 +319,13 @@ bool BFCAllocator::DeallocateFreeRegions(size_t rounded_bytes) { void BFCAllocator::DeallocateRegions( const absl::flat_hash_set& region_ptrs) { - auto it = region_manager_.regions().begin(); - while (it != region_manager_.regions().end()) { + // Explicitly remove the const qualifier as some compilers disallow passing + // const_iterator to std::vector::erase(), which is used in + // RemoveAllocationRegion(). + auto regions = + const_cast*>(®ion_manager_.regions()); + auto it = regions->begin(); + while (it != regions->end()) { if (!region_ptrs.contains(it->ptr())) { ++it; continue; diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h index f3d922f342b..f3f31441bbc 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.h +++ b/tensorflow/core/common_runtime/bfc_allocator.h @@ -311,8 +311,8 @@ class BFCAllocator : public Allocator { regions_.insert(entry, AllocationRegion(ptr, memory_size)); } - std::vector::const_iterator RemoveAllocationRegion( - std::vector::const_iterator it) { + std::vector::iterator RemoveAllocationRegion( + std::vector::iterator it) { return regions_.erase(it); } From c1e9307281ecaa2679ffbb54a26b84c47a3f2cb6 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 19 Jul 2019 14:35:31 -0700 Subject: [PATCH 0194/3053] Metal convolution unit tests added. 
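As an informal hand check of the expected values in the first test added below
(a sketch for reference, not part of the patch): with an all-ones 2x2x1 input,
per-output-channel kernel taps [1, 2] and [3, 4] along H, bias {1, 1}, and one
appended padding row,

    row 0: channel 0 = 1*1 + 2*1 + 1 = 4,  channel 1 = 3*1 + 4*1 + 1 = 8
    row 1: channel 0 = 1*1 + 2*0 + 1 = 2,  channel 1 = 3*1 + 4*0 + 1 = 4

which matches the expected output {4, 8, 4, 8, 2, 4, 2, 4}.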
PiperOrigin-RevId: 259040457 --- .../lite/delegates/gpu/metal/kernels/BUILD | 22 ++ .../delegates/gpu/metal/kernels/conv_test.mm | 243 ++++++++++++++++++ 2 files changed, 265 insertions(+) create mode 100644 tensorflow/lite/delegates/gpu/metal/kernels/conv_test.mm diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/BUILD b/tensorflow/lite/delegates/gpu/metal/kernels/BUILD index 467bb1d2012..4df787c80dc 100644 --- a/tensorflow/lite/delegates/gpu/metal/kernels/BUILD +++ b/tensorflow/lite/delegates/gpu/metal/kernels/BUILD @@ -119,6 +119,28 @@ cc_library( ], ) +objc_library( + name = "conv_test_lib", + testonly = 1, + srcs = ["conv_test.mm"], + sdk_frameworks = ["XCTest"], + deps = [ + ":conv", + ":test_util", + ], +) + +ios_unit_test( + name = "conv_test", + testonly = 1, + minimum_os_version = "9.0", + tags = [ + "notap", + "tflite_not_portable_android", + ], + deps = [":conv_test_lib"], +) + cc_library( name = "depthwise_conv", srcs = ["depthwise_conv.cc"], diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/conv_test.mm b/tensorflow/lite/delegates/gpu/metal/kernels/conv_test.mm new file mode 100644 index 00000000000..b9cbd65620d --- /dev/null +++ b/tensorflow/lite/delegates/gpu/metal/kernels/conv_test.mm @@ -0,0 +1,243 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/lite/delegates/gpu/metal/kernels/add.h" + +#import + +#include + +#include "tensorflow/lite/delegates/gpu/common/operations.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" +#include "tensorflow/lite/delegates/gpu/common/status.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" +#include "tensorflow/lite/delegates/gpu/common/util.h" +#include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h" +#include "tensorflow/lite/delegates/gpu/metal/kernels/test_util.h" +#include "tensorflow/lite/delegates/gpu/metal/runtime_options.h" + +using ::tflite::gpu::Axis; +using ::tflite::gpu::Convolution2DAttributes; +using ::tflite::gpu::DataType; +using ::tflite::gpu::BHWC; +using ::tflite::gpu::HW; +using ::tflite::gpu::Linear; +using ::tflite::gpu::metal::CompareVectors; +using ::tflite::gpu::metal::SingleOpModel; +using ::tflite::gpu::OperationType; +using ::tflite::gpu::OHWI; +using ::tflite::gpu::Tensor; +using ::tflite::gpu::TensorRef; + +@interface ConvTest : XCTestCase +@end + +@implementation ConvTest +- (void)setUp { + [super setUp]; +} + +- (void)testO2H2W1I1Stride1x1Dilation1x1 { + TensorRef input; + input.type = DataType::FLOAT32; + input.ref = 0; + input.shape = BHWC(1, 2, 2, 1); + + Convolution2DAttributes attr; + Tensor bias; + bias.shape.v = 2; + bias.id = 1; + bias.data = {1, 1}; + attr.bias = std::move(bias); + + Tensor weights; + weights.shape = OHWI(2, 2, 1, 1); + weights.id = 2; + weights.data = {1, 2, 3, 4}; + attr.weights = std::move(weights); + + attr.dilations = HW(1, 1); + attr.padding.prepended = HW(0, 0); + attr.padding.appended = HW(1, 0); + attr.strides = HW(1, 1); + + TensorRef output; + output.type = DataType::FLOAT32; + output.ref = 3; + output.shape = BHWC(1, 2, 2, 2); + + SingleOpModel model({ToString(OperationType::CONVOLUTION_2D), std::move(attr)}, {input}, + {output}); + XCTAssertTrue(model.PopulateTensor(0, {1, 1, 1, 1})); + auto status = model.Invoke(); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); + status = CompareVectors({4, 8, 4, 8, 2, 4, 2, 4}, model.GetOutput(0), 1e-6f); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); +} + +- (void)testO1H2W2I1Stride1x1Dilation2x2 { + TensorRef input; + input.type = DataType::FLOAT32; + input.ref = 0; + input.shape = BHWC(1, 3, 3, 1); + + Convolution2DAttributes attr; + Tensor bias; + bias.shape.v = 2; + bias.id = 1; + bias.data.push_back(0.0); + attr.bias = std::move(bias); + + Tensor weights; + weights.shape = OHWI(1, 2, 2, 1); + weights.id = 2; + weights.data = {1, 2, 3, 4}; + attr.weights = std::move(weights); + + attr.dilations = HW(2, 2); + attr.padding.prepended = HW(0, 0); + attr.padding.appended = HW(0, 0); + attr.strides = HW(1, 1); + + TensorRef output; + output.type = DataType::FLOAT32; + output.ref = 3; + output.shape = BHWC(1, 1, 1, 1); + + SingleOpModel model({ToString(OperationType::CONVOLUTION_2D), std::move(attr)}, {input}, + {output}); + XCTAssertTrue(model.PopulateTensor(0, {1, 1, 1, 1, 1, 1, 1, 1, 1})); + auto status = model.Invoke(); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); + status = CompareVectors({10}, model.GetOutput(0), 1e-6f); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); +} + +- (void)testO1H3W3I1Stride1x1Dilation1x1 { + TensorRef input; + input.type = DataType::FLOAT32; + input.ref = 0; + input.shape = BHWC(1, 2, 2, 1); + + Convolution2DAttributes attr; + Tensor bias; + bias.shape.v = 
1; + bias.id = 1; + bias.data.push_back(1.0); + attr.bias = std::move(bias); + + Tensor weights; + weights.shape = OHWI(1, 3, 3, 1); + weights.id = 2; + weights.data = {1, 2, 3, 1, 2, 3, 1, 2, 3}; + attr.weights = std::move(weights); + + attr.dilations = HW(1, 1); + attr.padding.prepended = HW(1, 1); + attr.padding.appended = HW(0, 0); + attr.strides = HW(1, 1); + + TensorRef output; + output.type = DataType::FLOAT32; + output.ref = 3; + output.shape = BHWC(1, 1, 1, 1); + + SingleOpModel model({ToString(OperationType::CONVOLUTION_2D), std::move(attr)}, {input}, + {output}); + XCTAssertTrue(model.PopulateTensor(0, {1, 1, 1, 1})); + auto status = model.Invoke(); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); + status = CompareVectors({11}, model.GetOutput(0), 1e-6f); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); +} + +- (void)testO2H1W1I2Stride1x1Dilation1x1 { + TensorRef input; + input.type = DataType::FLOAT32; + input.ref = 0; + input.shape = BHWC(1, 2, 1, 2); + + Convolution2DAttributes attr; + Tensor bias; + bias.shape.v = 2; + bias.id = 1; + bias.data = {1, 1}; + attr.bias = std::move(bias); + + Tensor weights; + weights.shape = OHWI(2, 1, 1, 2); + weights.id = 2; + weights.data = {1, 2, 3, 4}; + attr.weights = std::move(weights); + + attr.dilations = HW(1, 1); + attr.padding.prepended = HW(0, 0); + attr.padding.appended = HW(0, 0); + attr.strides = HW(1, 1); + + TensorRef output; + output.type = DataType::FLOAT32; + output.ref = 3; + output.shape = BHWC(1, 2, 1, 2); + + SingleOpModel model({ToString(OperationType::CONVOLUTION_2D), std::move(attr)}, {input}, + {output}); + XCTAssertTrue(model.PopulateTensor(0, {1, 1, 1, 1})); + auto status = model.Invoke(); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); + status = CompareVectors({4, 8, 4, 8}, model.GetOutput(0), 1e-6f); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); +} + +- (void)testO1H1W1I1Stride2x2Dilation1x1 { + TensorRef input; + input.type = DataType::FLOAT32; + input.ref = 0; + input.shape = BHWC(1, 3, 3, 1); + + Convolution2DAttributes attr; + Tensor bias; + bias.shape.v = 2; + bias.id = 1; + bias.data.push_back(0.0); + attr.bias = std::move(bias); + + Tensor weights; + weights.shape = OHWI(1, 1, 1, 1); + weights.id = 2; + weights.data.push_back(2.0); + + attr.weights = std::move(weights); + + attr.dilations = HW(1, 1); + attr.padding.prepended = HW(0, 0); + attr.padding.appended = HW(0, 0); + attr.strides = HW(2, 2); + + TensorRef output; + output.type = DataType::FLOAT32; + output.ref = 3; + output.shape = BHWC(1, 2, 2, 1); + + SingleOpModel model({ToString(OperationType::CONVOLUTION_2D), std::move(attr)}, {input}, + {output}); + XCTAssertTrue(model.PopulateTensor(0, {1, 0, 2, 0, 0, 0, 4, 0, 8})); + auto status = model.Invoke(); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); + status = CompareVectors({2, 4, 8, 16}, model.GetOutput(0), 1e-6f); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); +} + +@end From d5cc8288d939b522982738370facd456a0a643ba Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Fri, 19 Jul 2019 14:54:17 -0700 Subject: [PATCH 0195/3053] Allow mfcc operator to take scalar rate input PiperOrigin-RevId: 259044058 --- tensorflow/lite/kernels/mfcc.cc | 29 ++++++++++++++-------------- tensorflow/lite/kernels/mfcc_test.cc | 25 ++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 14 deletions(-) diff --git a/tensorflow/lite/kernels/mfcc.cc b/tensorflow/lite/kernels/mfcc.cc index f5b0212728e..da172bb4827 100644 --- 
a/tensorflow/lite/kernels/mfcc.cc +++ b/tensorflow/lite/kernels/mfcc.cc @@ -67,19 +67,20 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumInputs(node), 2); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - const TfLiteTensor* inputWav = GetInput(context, node, kInputTensorWav); - const TfLiteTensor* inputRate = GetInput(context, node, kInputTensorRate); + const TfLiteTensor* input_wav = GetInput(context, node, kInputTensorWav); + const TfLiteTensor* input_rate = GetInput(context, node, kInputTensorRate); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - TF_LITE_ENSURE_EQ(context, NumDimensions(inputWav), 3); - TF_LITE_ENSURE_EQ(context, NumDimensions(inputRate), 1); + TF_LITE_ENSURE_EQ(context, NumDimensions(input_wav), 3); + TF_LITE_ENSURE_EQ(context, NumElements(input_rate), 1); TF_LITE_ENSURE_EQ(context, output->type, kTfLiteFloat32); - TF_LITE_ENSURE_EQ(context, inputWav->type, output->type); + TF_LITE_ENSURE_EQ(context, input_wav->type, output->type); + TF_LITE_ENSURE_EQ(context, input_rate->type, kTfLiteInt32); TfLiteIntArray* output_size = TfLiteIntArrayCreate(3); - output_size->data[0] = inputWav->dims->data[0]; - output_size->data[1] = inputWav->dims->data[1]; + output_size->data[0] = input_wav->dims->data[0]; + output_size->data[1] = input_wav->dims->data[1]; output_size->data[2] = params->dct_coefficient_count; return context->ResizeTensor(context, output, output_size); @@ -94,15 +95,15 @@ template TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { auto* params = reinterpret_cast(node->user_data); - const TfLiteTensor* inputWav = GetInput(context, node, kInputTensorWav); - const TfLiteTensor* inputRate = GetInput(context, node, kInputTensorRate); + const TfLiteTensor* input_wav = GetInput(context, node, kInputTensorWav); + const TfLiteTensor* input_rate = GetInput(context, node, kInputTensorRate); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); - const int32 sample_rate = *GetTensorData(inputRate); + const int32 sample_rate = *GetTensorData(input_rate); - const int spectrogram_channels = inputWav->dims->data[2]; - const int spectrogram_samples = inputWav->dims->data[1]; - const int audio_channels = inputWav->dims->data[0]; + const int spectrogram_channels = input_wav->dims->data[2]; + const int spectrogram_samples = input_wav->dims->data[1]; + const int audio_channels = input_wav->dims->data[0]; internal::Mfcc mfcc; mfcc.set_upper_frequency_limit(params->upper_frequency_limit); @@ -112,7 +113,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { mfcc.Initialize(spectrogram_channels, sample_rate); - const float* spectrogram_flat = GetTensorData(inputWav); + const float* spectrogram_flat = GetTensorData(input_wav); float* output_flat = GetTensorData(output); for (int audio_channel = 0; audio_channel < audio_channels; ++audio_channel) { diff --git a/tensorflow/lite/kernels/mfcc_test.cc b/tensorflow/lite/kernels/mfcc_test.cc index 7b5591b3b67..99dcc3c8a72 100644 --- a/tensorflow/lite/kernels/mfcc_test.cc +++ b/tensorflow/lite/kernels/mfcc_test.cc @@ -92,6 +92,31 @@ TEST(MfccOpTest, SimpleTest) { 1e-3))); } +TEST(MfccOpTest, ScalarInputRateTest) { + BaseMfccOpModel m({TensorType_FLOAT32, {1, 1, 513}}, {TensorType_INT32, {}}, + {TensorType_FLOAT32, {}}); + + std::vector data(513); + for (int i = 0; i < data.size(); ++i) { + data[i] = i + 1; + } + m.PopulateTensor(m.input1(), 0, data.data(), + data.data() + data.size()); + m.PopulateTensor(m.input2(), {22050}); + + m.Invoke(); + + 
std::vector output_shape = m.GetOutputShape(); + EXPECT_THAT(output_shape, ElementsAre(1, 1, 13)); + EXPECT_THAT( + m.GetOutput(), + ElementsAreArray(ArrayFloatNear( + {29.13970072, -6.41568601, -0.61903012, -0.96778652, -0.26819878, + -0.40907028, -0.15614748, -0.23203119, -0.10481487, -0.1543029, + -0.0769791, -0.10806114, -0.06047613}, + 1e-3))); +} + } // namespace } // namespace custom } // namespace ops From 150367468c91720f5283de22d24a31174d582a0d Mon Sep 17 00:00:00 2001 From: Sundeep Gottipati Date: Fri, 19 Jul 2019 15:06:05 -0700 Subject: [PATCH 0196/3053] Implement __lt__ method on FeatureColumn base class so that they are sortable in Python 3. PiperOrigin-RevId: 259046547 --- .../python/feature_column/feature_column.py | 19 +++++++++++++++++++ .../feature_column/feature_column_v2.py | 19 +++++++++++++++++++ .../feature_column/feature_column_v2_test.py | 13 +++++++++++++ 3 files changed, 51 insertions(+) diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py index cf3043ec7bb..640561f4995 100644 --- a/tensorflow/python/feature_column/feature_column.py +++ b/tensorflow/python/feature_column/feature_column.py @@ -1757,6 +1757,25 @@ class _FeatureColumn(object): """Returns string. Used for naming and for name_scope.""" pass + def __lt__(self, other): + """Allows feature columns to be sortable in Python 3 as they are in 2. + + Feature columns need to occasionally be sortable, for example when used as + keys in a features dictionary passed to a layer. + + `__lt__` is the only method needed for sorting in CPython: + https://docs.python.org/3/library/stdtypes.html#list.sort + + Args: + other: The other object to compare to. + + Returns: + True if the string representation of this object is lexicographically less + than the string representation of `other`. For FeatureColumn objects, + this looks like "<__main__.FeatureColumn object at 0x7fa1fc02bba8>". + """ + return str(self) < str(other) + @property def _var_scope_name(self): """Returns string. Used for variable_scope. Defaults to self.name.""" diff --git a/tensorflow/python/feature_column/feature_column_v2.py b/tensorflow/python/feature_column/feature_column_v2.py index a9d0fa2e906..96a08141076 100644 --- a/tensorflow/python/feature_column/feature_column_v2.py +++ b/tensorflow/python/feature_column/feature_column_v2.py @@ -2197,6 +2197,25 @@ class FeatureColumn(object): """Returns string. Used for naming.""" pass + def __lt__(self, other): + """Allows feature columns to be sortable in Python 3 as they are in 2. + + Feature columns need to occasionally be sortable, for example when used as + keys in a features dictionary passed to a layer. + + `__lt__` is the only method needed for sorting in CPython: + https://docs.python.org/3/library/stdtypes.html#list.sort + + Args: + other: The other object to compare to. + + Returns: + True if the string representation of this object is lexicographically less + than the string representation of `other`. For FeatureColumn objects, + this looks like "<__main__.FeatureColumn object at 0x7fa1fc02bba8>". + """ + return str(self) < str(other) + @abc.abstractmethod def transform_feature(self, transformation_cache, state_manager): """Returns intermediate representation (usually a `Tensor`). 
diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py index f56c01bd198..528f8fec83e 100644 --- a/tensorflow/python/feature_column/feature_column_v2_test.py +++ b/tensorflow/python/feature_column/feature_column_v2_test.py @@ -89,6 +89,19 @@ class BaseFeatureColumnForTests(fc.FeatureColumn): raise ValueError('Should not use this method.') +class SortableFeatureColumnTest(test.TestCase): + + def test_sort_columns_by_name(self): + # These should be sorted lexicographically based on their string + # representations. For FeatureColumns, this looks like + # '<__main__.FeatureColumn object at ...>'. + + a = fc.numeric_column('first') # '<__main__.NumericColumn object at 0xa>' + b = fc.numeric_column('second') # '<__main__.NumericColumn object at 0xb>' + c = fc_old._numeric_column('third') # '<__main__._NumericColumn ...>' + self.assertAllEqual(sorted(['d', c, b, a]), [a, b, c, 'd']) + + class LazyColumnTest(test.TestCase): def test_transformations_called_once(self): From b87f3de207e7184471b0facefffeaa2b9409c0c4 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 19 Jul 2019 15:14:01 -0700 Subject: [PATCH 0197/3053] Prune Dequantize nodes in GPU delegate when next op is replaceable. PiperOrigin-RevId: 259047903 --- .../delegates/gpu/common/model_builder.cc | 32 +++---- .../gpu/common/model_builder_test.cc | 89 ++++++++++++++++--- 2 files changed, 90 insertions(+), 31 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.cc b/tensorflow/lite/delegates/gpu/common/model_builder.cc index 9a89c0df9b9..a987c274a75 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder.cc +++ b/tensorflow/lite/delegates/gpu/common/model_builder.cc @@ -2220,9 +2220,7 @@ TfLiteIntArray* GetOpsToReplace(TfLiteContext* context) { return nullptr; } TfLiteIntArray* subgraph = TfLiteIntArrayCreate(execution_plan->size); - std::vector pruned_graph; subgraph->size = 0; - // pruned_graph will not include dequantize operations. std::set errors; // Map the output tensor of a Dequantize nodes to its input tensor. @@ -2241,31 +2239,23 @@ TfLiteIntArray* GetOpsToReplace(TfLiteContext* context) { TfLiteType::kTfLiteFloat16) { // Record the output->input mapping for the op. node_map[node->outputs->data[0]] = node->inputs->data[0]; - } else { - // Fix the node's inputs. + continue; + } + status = IsSupported(context, node, registration); + if (status.ok() && + // TODO(eignasheva): resolve sub operation support for metal delegate + // registration->builtin_code != kTfLiteBuiltinSub && + IsAllFloatTensors(context, node->inputs) && + IsAllFloatTensors(context, node->outputs)) { + // Fix the node's inputs (i.e. prune out the preceding dequantize node) + // if the op is supported. TfLiteIntArray* inputs = node->inputs; for (int j = 0; j < inputs->size; ++j) { if (node_map.find(inputs->data[j]) != node_map.end()) { inputs->data[j] = node_map[inputs->data[j]]; } } - // Add the op to the graph. 
- pruned_graph.push_back(i); - } - } - - for (int i = 0; i < pruned_graph.size(); ++i) { - TfLiteNode* node = nullptr; - TfLiteRegistration* registration = nullptr; - GetNodeAndRegistration(context, pruned_graph[i], &node, ®istration) - .IgnoreError(); - const auto status = IsSupported(context, node, registration); - if (status.ok() && - // TODO(eignasheva): resolve sub operation support for metal delegate - // registration->builtin_code != kTfLiteBuiltinSub && - IsAllFloatTensors(context, node->inputs) && - IsAllFloatTensors(context, node->outputs)) { - if (errors.empty()) subgraph->data[subgraph->size++] = pruned_graph[i]; + if (errors.empty()) subgraph->data[subgraph->size++] = i; } else { errors.insert(GetOpNameByRegistration(registration) + ": " + status.error_message()); diff --git a/tensorflow/lite/delegates/gpu/common/model_builder_test.cc b/tensorflow/lite/delegates/gpu/common/model_builder_test.cc index 1f182b2e41d..31c7c570867 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder_test.cc +++ b/tensorflow/lite/delegates/gpu/common/model_builder_test.cc @@ -122,7 +122,7 @@ TEST(ModelBuilderTest, ConvertTfLiteTensorToTensorRefFailsForRankGT3) { class InterpreterFp16 { public: - InterpreterFp16() { + explicit InterpreterFp16(TfLiteBuiltinOperator op) { void* builtin_data = malloc(sizeof(int)); EXPECT_EQ(interpreter_.AddTensors(5), kTfLiteOk); EXPECT_EQ(interpreter_.SetInputs({0, 1}), kTfLiteOk); @@ -147,7 +147,7 @@ class InterpreterFp16 { kTfLiteOk); // Add a node that GPU delegate can parse. - const TfLiteRegistration reg_add0 = { + const TfLiteRegistration reg_op0 = { [](TfLiteContext* context, const char* buffer, size_t length) { return reinterpret_cast(new int(1)); }, @@ -157,15 +157,16 @@ class InterpreterFp16 { nullptr, nullptr, nullptr, - kTfLiteBuiltinAdd}; + op}; EXPECT_EQ(interpreter_.AddNodeWithParameters( /*inputs=*/{1, 3}, /*outputs=*/{4}, /*init_data=*/nullptr, /*init_data_size=*/0, /*builtin_data=*/builtin_data, - /*registration=*/®_add0), + /*registration=*/®_op0), kTfLiteOk); - // Set inputs to Dequantize node to the specified type. + // Set inputs to Dequantize node to the fp16 type, and outputs + // to fp32 type. const std::vector dims = {1}; TfLiteQuantization quantization; quantization.type = kTfLiteNoQuantization; @@ -177,6 +178,15 @@ class InterpreterFp16 { interpreter_.SetTensorParametersReadWrite( 2, TfLiteType::kTfLiteFloat16, "t2", dims, quantization, false), kTfLiteOk); + EXPECT_EQ( + interpreter_.SetTensorParametersReadWrite( + 1, TfLiteType::kTfLiteFloat32, "t1", dims, quantization, false), + kTfLiteOk); + EXPECT_EQ( + interpreter_.SetTensorParametersReadWrite( + 3, TfLiteType::kTfLiteFloat32, "t3", dims, quantization, false), + kTfLiteOk); + exec_plan_ = TfLiteIntArrayCreate(3); exec_plan_->data[0] = 0; exec_plan_->data[1] = 1; @@ -193,7 +203,8 @@ class InterpreterFp16 { TfLiteIntArray* exec_plan_; }; -InterpreterFp16* interpreter_fp16 = new InterpreterFp16(); +InterpreterFp16* interpreter_fp16_add_op = + new InterpreterFp16(kTfLiteBuiltinAdd); TEST(ModelBuilderTest, GetOpsToReplacePrunesFp16DequantizeNodes) { // Before pruning, the graph has three nodes: @@ -206,19 +217,19 @@ TEST(ModelBuilderTest, GetOpsToReplacePrunesFp16DequantizeNodes) { // t0 (FP16) --> Add -> t4 // t2 (FP16) --/ // - TfLiteContext* context = interpreter_fp16->GetSubgraph()->context(); + TfLiteContext* context = interpreter_fp16_add_op->GetSubgraph()->context(); // These functions are meant to be called inside delegates. 
Swap out // for similar functions to permit direct calling of GetOpsToReplace. context->GetExecutionPlan = [](struct TfLiteContext* context, TfLiteIntArray** execution_plan) { - *execution_plan = interpreter_fp16->exec_plan(); + *execution_plan = interpreter_fp16_add_op->exec_plan(); return kTfLiteOk; }; context->GetNodeAndRegistration = [](struct TfLiteContext*, int node_index, TfLiteNode** node, TfLiteRegistration** registration) { - auto& node_and_reg = - interpreter_fp16->GetSubgraph()->nodes_and_registration()[node_index]; + auto& node_and_reg = interpreter_fp16_add_op->GetSubgraph() + ->nodes_and_registration()[node_index]; *node = &node_and_reg.first; *registration = &node_and_reg.second; return kTfLiteOk; @@ -239,6 +250,64 @@ TEST(ModelBuilderTest, GetOpsToReplacePrunesFp16DequantizeNodes) { TfLiteIntArrayFree(ops_to_replace); } +// This interpreter instance is created at global scope to test *exactly* +// the GetOpsToReplace function alone, and not the sequence of function calls +// that includes GetOpsToReplace when calling ModifyGraphWithDelegate. +// A TfLiteContext is needed to test GetOpsToReplace, but TfLiteContexts +// intentionally make it difficult to call certain functions in a +// non-delegate context (see tensorflow/lite/subgraph/subgraph.cc for details) +// We create our own GetExecutionPlan and GetNodeAndRegistration lambdas +// inside each test, but we can't use local captures without changing the +// function signature. Therefore, this test data lives at global scope +// in order to be accessible inside the lambda. + +InterpreterFp16* interpreter_fp16_gt_op = + new InterpreterFp16(kTfLiteBuiltinGreater); + +TEST(ModelBuilderTest, GetOpsToReplaceKeepsFp16DequantizeNodes) { + // Before pruning, the graph has three nodes: + // + // t0 (FP16) -> DequantNode -> t1 (FP32) -> Greater Op -> t4 + // t2 (FP16) -> DequantNode -> t3 (FP32) --/ + // + // Because there is no GPU equivalent for the Greater op, we don't prune + // the Dequantize nodes. + + TfLiteContext* context = interpreter_fp16_gt_op->GetSubgraph()->context(); + // These functions are meant to be called inside delegates. Swap out + // for similar functions to permit direct calling of GetOpsToReplace. + context->GetExecutionPlan = [](struct TfLiteContext* context, + TfLiteIntArray** execution_plan) { + *execution_plan = interpreter_fp16_gt_op->exec_plan(); + return kTfLiteOk; + }; + context->GetNodeAndRegistration = [](struct TfLiteContext*, int node_index, + TfLiteNode** node, + TfLiteRegistration** registration) { + auto& node_and_reg = interpreter_fp16_gt_op->GetSubgraph() + ->nodes_and_registration()[node_index]; + *node = &node_and_reg.first; + *registration = &node_and_reg.second; + return kTfLiteOk; + }; + + TfLiteIntArray* ops_to_replace = GetOpsToReplace(context); + + // No nodes were found to replace. + EXPECT_EQ(ops_to_replace->size, 0); + // Inputs to Greater op are still fp32. 
+ TfLiteNode* node = nullptr; + TfLiteRegistration* registration = nullptr; + const int kGreaterOpIndex = 2; + context->GetNodeAndRegistration(context, kGreaterOpIndex, &node, + ®istration); + EXPECT_EQ(context->tensors[node->inputs->data[0]].type, + TfLiteType::kTfLiteFloat32); + EXPECT_EQ(context->tensors[node->inputs->data[1]].type, + TfLiteType::kTfLiteFloat32); + TfLiteIntArrayFree(ops_to_replace); +} + class InterpreterFp32 { public: InterpreterFp32() { From 56f400a5faba1b87ac86ea6e8be772a2498f2b6e Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Fri, 19 Jul 2019 15:19:38 -0700 Subject: [PATCH 0198/3053] Combine TensorRT calibration and cache resources Move calibration resource member variables to the cache resource. Calibrator is still global for all shapes. In a follow-up CL, there will be calibrator for each member of the cache. PiperOrigin-RevId: 259048795 --- tensorflow/compiler/tf2tensorrt/BUILD | 2 - .../tf2tensorrt/convert/convert_graph.cc | 1 - .../tf2tensorrt/convert/convert_nodes.cc | 1 - .../tf2tensorrt/convert/convert_nodes.h | 1 - .../kernels/get_calibration_data_op.cc | 19 +++-- .../tf2tensorrt/kernels/trt_engine_op.cc | 45 ++++++------ .../tf2tensorrt/kernels/trt_engine_op_test.cc | 1 - .../tf2tensorrt/utils/calibration_resource.cc | 61 ---------------- .../tf2tensorrt/utils/calibration_resource.h | 72 ------------------- .../tf2tensorrt/utils/trt_lru_cache.cc | 2 + .../tf2tensorrt/utils/trt_lru_cache.h | 39 ++++++++++ .../python/compiler/tensorrt/trt_convert.py | 29 ++++---- 12 files changed, 87 insertions(+), 186 deletions(-) delete mode 100644 tensorflow/compiler/tf2tensorrt/utils/calibration_resource.cc delete mode 100644 tensorflow/compiler/tf2tensorrt/utils/calibration_resource.h diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD index bfaae215709..fee4d8a4f5a 100644 --- a/tensorflow/compiler/tf2tensorrt/BUILD +++ b/tensorflow/compiler/tf2tensorrt/BUILD @@ -235,12 +235,10 @@ tf_custom_op_py_library( tf_cuda_library( name = "trt_resources", srcs = [ - "utils/calibration_resource.cc", "utils/trt_int8_calibrator.cc", "utils/trt_lru_cache.cc", ], hdrs = [ - "utils/calibration_resource.h", "utils/trt_int8_calibrator.h", "utils/trt_lru_cache.h", ], diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index fb5dda9953e..d5004af7147 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -28,7 +28,6 @@ limitations under the License. #include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/compiler/tf2tensorrt/segment/segment.h" -#include "tensorflow/compiler/tf2tensorrt/utils/calibration_resource.h" #include "tensorflow/core/common_runtime/gpu/gpu_id.h" #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h" #include "tensorflow/core/common_runtime/gpu/gpu_process_state.h" diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index c068c4cc06c..3d223d77108 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -29,7 +29,6 @@ limitations under the License. 
#include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" -#include "tensorflow/compiler/tf2tensorrt/utils/calibration_resource.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" #include "tensorflow/core/framework/node_def.pb.h" // NOLINT #include "tensorflow/core/framework/node_def_builder.h" diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h index a6a7afe121e..c4249ff5c1b 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h @@ -23,7 +23,6 @@ limitations under the License. #include #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" -#include "tensorflow/compiler/tf2tensorrt/utils/calibration_resource.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" diff --git a/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc index 2898602b879..7af6052446d 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/get_calibration_data_op.cc @@ -16,7 +16,7 @@ limitations under the License. #include #include -#include "tensorflow/compiler/tf2tensorrt/utils/calibration_resource.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/resource_mgr.h" @@ -39,27 +39,26 @@ class GetCalibrationDataOp : public OpKernel { // TODO(laigd): it will allocate the tensor on the device and copy the // serialized string to that tensor, and later sess.run() will copy it back // to host. We need to optimize this. - const string& resource_name = context->input(0).scalar()(); + const string& resource_name = context->input(0).scalar()(); // Get the resource. - TRTCalibrationResource* resource = nullptr; + TRTEngineCacheResource* resource = nullptr; OP_REQUIRES_OK(context, context->resource_manager()->Lookup( - std::string(kCalibrationContainerName), - resource_name, &resource)); + std::string(kCacheContainerName), resource_name, + &resource)); core::ScopedUnref sc(resource); + auto* calib_ctx = resource->calib_ctx_.get(); + // Serialize the resource as output. string serialized_resource; - OP_REQUIRES_OK(context, resource->SerializeToString(&serialized_resource)); + OP_REQUIRES_OK(context, calib_ctx->SerializeToString(&serialized_resource)); + resource->calib_ctx_.reset(); Tensor* output = nullptr; OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({}), &output)); - // Destroy the resource. - OP_REQUIRES_OK(context, - context->resource_manager()->Delete( - std::string(kCalibrationContainerName), resource_name)); output->scalar()() = serialized_resource; } }; diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index ab0b21edc41..2494e033cd6 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -21,7 +21,6 @@ limitations under the License. 
#include "absl/strings/string_view.h" #include "tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" -#include "tensorflow/compiler/tf2tensorrt/utils/calibration_resource.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h" @@ -101,8 +100,7 @@ class TRTEngineOp : public AsyncOpKernel { // Allocate necessary resources for calibration Status AllocateCalibrationResources(OpKernelContext* ctx, - TRTEngineCacheResource* cache_res, - TRTCalibrationResource** cr); + TRTEngineCacheResource* cache_res); Status GetEngineCacheResource(OpKernelContext* ctx, TRTEngineCacheResource** cache_res); @@ -278,22 +276,13 @@ void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx, VLOG(1) << "Executing TRT calibration: " << name(); helper->Ref(); core::ScopedUnref sc(helper); - // Get the cache resource outside the LookupOrCreate() below to avoid - // deadlock. + TRTEngineCacheResource* cache_res = nullptr; OP_REQUIRES_OK_ASYNC(ctx, GetEngineCacheResource(ctx, &cache_res), *helper); core::ScopedUnref unref_cache_res(cache_res); - TRTCalibrationResource* calib_res = nullptr; - OP_REQUIRES_OK_ASYNC( - ctx, - ctx->resource_manager()->LookupOrCreate( - std::string(kCalibrationContainerName), name(), - reinterpret_cast(&calib_res), - {[ctx, cache_res, this](TRTCalibrationResource** cr) -> Status { - return this->AllocateCalibrationResources(ctx, cache_res, cr); - }}), - *helper); - core::ScopedUnref calib_sc(calib_res); + + CalibrationContext* calib_ctx = cache_res->calib_ctx_.get(); + int num_inputs = ctx->num_inputs(); // TODO(laigd): need to check that input shape matches. // Pass input data to calibrator @@ -307,7 +296,7 @@ void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx, *helper); // Check the allocated buffer is sufficient for input const auto device_tensor = - calib_res->device_tensors_.at(i).AccessTensor(ctx); + calib_ctx->device_tensors_.at(i).AccessTensor(ctx); CHECK_EQ(t.TotalBytes(), device_tensor->TotalBytes()); input_data.emplace(StrCat(kInputPHName, i), data_address); } @@ -326,7 +315,7 @@ void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx, // until setDone() is called later by the calibration thread in // AllocateCalibrationResources(). In that case, this setBatch() will always // be able to detect the error and return false. - OP_REQUIRES_ASYNC(ctx, calib_res->calibrator_->setBatch(input_data, *stream), + OP_REQUIRES_ASYNC(ctx, calib_ctx->calibrator_->setBatch(input_data, *stream), errors::Internal("Failed to feed calibration data"), *helper); VLOG(2) << "Passed calibration data"; @@ -580,9 +569,12 @@ Status TRTEngineOp::GetEngineCacheResource(OpKernelContext* ctx, // Get engine cache. return ctx->resource_manager()->LookupOrCreate( - "TF-TRT-Engine-Cache", string(resource_name), cache_res, + std::string(kCacheContainerName), std::string(resource_name), cache_res, {[this, ctx](TRTEngineCacheResource** cr) -> Status { *cr = new TRTEngineCacheResource(ctx, this->max_cached_engines_); + if (calibration_mode_) { + TF_RETURN_IF_ERROR(AllocateCalibrationResources(ctx, *cr)); + } return Status::OK(); }}); } @@ -694,11 +686,13 @@ StatusOr TRTEngineOp::GetEngine( return cache.at(engine_input_shapes).get(); } +// TODO(hinsu): Move this allocation to CalibrationContext constructor, if +// possible. 
Status TRTEngineOp::AllocateCalibrationResources( - OpKernelContext* ctx, TRTEngineCacheResource* cache_res, - TRTCalibrationResource** cr) { - auto cres = new TRTCalibrationResource(); - *cr = cres; + OpKernelContext* ctx, TRTEngineCacheResource* cache_res) { + cache_res->calib_ctx_ = absl::make_unique(); + auto* cres = cache_res->calib_ctx_.get(); + // Get the input shapes. const int batch_size = ctx->input(0).dim_size(0); const int num_inputs = ctx->num_inputs(); @@ -758,8 +752,9 @@ Status TRTEngineOp::AllocateCalibrationResources( auto s = convert::ConvertGraphDefToEngine( this->segment_graph_, TrtPrecisionMode::INT8, cres->calibrator_->getBatchSize(), this->workspace_size_, - partial_shapes, &cres->logger_, cache_res->allocator_.get(), - cres->calibrator_.get(), &cres->engine_, + partial_shapes, &cache_res->GetLogger(), + cache_res->allocator_.get(), cres->calibrator_.get(), + &cres->engine_, /*use_calibration=*/true, /*convert_successfully=*/nullptr); if (!s.ok()) { diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc index d859d5f957f..1c08061f398 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc @@ -23,7 +23,6 @@ limitations under the License. #include #include #include "tensorflow/cc/ops/standard_ops.h" -#include "tensorflow/compiler/tf2tensorrt/utils/calibration_resource.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h" #include "tensorflow/core/framework/fake_input.h" #include "tensorflow/core/framework/node_def_builder.h" diff --git a/tensorflow/compiler/tf2tensorrt/utils/calibration_resource.cc b/tensorflow/compiler/tf2tensorrt/utils/calibration_resource.cc deleted file mode 100644 index 5d6e11b536e..00000000000 --- a/tensorflow/compiler/tf2tensorrt/utils/calibration_resource.cc +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/compiler/tf2tensorrt/utils/calibration_resource.h" - -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT - -namespace tensorflow { -namespace tensorrt { - -const absl::string_view kCalibrationContainerName = "TF-TRT-Calibration"; - -TRTCalibrationResource::~TRTCalibrationResource() { - VLOG(0) << "Destroying Calibration Resource " << std::endl << DebugString(); -} - -string TRTCalibrationResource::DebugString() const { - std::stringstream oss; - using std::dec; - using std::endl; - using std::hex; - oss << " Calibrator = " << hex << calibrator_.get() << dec << endl - << " Builder = " << hex << builder_.get() << dec << endl - << " Engine = " << hex << engine_.get() << dec << endl - << " Logger = " << hex << &logger_ << dec << endl - << " Thread = " << hex << thr_.get() << dec << endl; - return oss.str(); -} - -void TRTCalibrationResource::SetCalibrationTable() { - calibration_table_ = calibrator_->getCalibrationTableAsString(); -} - -Status TRTCalibrationResource::SerializeToString(string* serialized) { - calibrator_->waitAndSetDone(); - thr_->join(); - *serialized = calibration_table_; - if (serialized->empty()) { - return errors::Unknown("Calibration table is empty."); - } - return Status::OK(); -} - -} // namespace tensorrt -} // namespace tensorflow - -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA diff --git a/tensorflow/compiler/tf2tensorrt/utils/calibration_resource.h b/tensorflow/compiler/tf2tensorrt/utils/calibration_resource.h deleted file mode 100644 index e7c29e9f1ed..00000000000 --- a/tensorflow/compiler/tf2tensorrt/utils/calibration_resource.h +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_RESOURCES_H_ -#define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_RESOURCES_H_ - -#include -#include -#include -#include -#include -#include - -#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" -#include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h" -#include "tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h" -#include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/resource_mgr.h" - -#if GOOGLE_CUDA -#if GOOGLE_TENSORRT -#include "third_party/tensorrt/NvInfer.h" - -namespace tensorflow { -namespace tensorrt { - -ABSL_CONST_INIT extern const absl::string_view kCalibrationContainerName; - -class TRTCalibrationResource : public ResourceBase { - public: - ~TRTCalibrationResource() override; - - string DebugString() const override; - - void SetCalibrationTable(); - - Status SerializeToString(string* serialized); - - // Lookup table for temporary staging areas of input tensors for calibration. 
- std::unordered_map> device_buffers_; - - // Temporary staging areas for calibration inputs. - std::vector device_tensors_; - - string calibration_table_; - std::unique_ptr calibrator_; - TrtUniquePtrType builder_; - TrtUniquePtrType engine_; - Logger logger_; - // TODO(sami): Use threadpool threads! - std::unique_ptr thr_; -}; - -} // namespace tensorrt -} // namespace tensorflow - -#endif // GOOGLE_TENSORRT -#endif // GOOGLE_CUDA -#endif // TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_RESOURCES_H_ diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc index 43dcd52b5a2..d518a378510 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.cc @@ -30,6 +30,8 @@ limitations under the License. namespace tensorflow { namespace tensorrt { +const absl::string_view kCacheContainerName = "TF-TRT-Engine-Cache"; + Logger& TRTEngineCacheResource::GetLogger() { static Logger* logger = new Logger(); return *logger; diff --git a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h index 442e0bcfb53..df25ee0ef1d 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h +++ b/tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h @@ -17,10 +17,12 @@ limitations under the License. #define TENSORFLOW_COMPILER_TF2TENSORRT_UTILS_TRT_LRU_CACHE_H_ #include +#include #include #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_allocator.h" +#include "tensorflow/compiler/tf2tensorrt/utils/trt_int8_calibrator.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" #include "tensorflow/core/framework/resource_mgr.h" #include "tensorflow/core/lib/core/errors.h" @@ -137,6 +139,39 @@ struct EngineContext { GUARDED_BY(mu); }; +// Contains the context required to build the calibration data. +class CalibrationContext { + public: + void SetCalibrationTable() { + calibration_table_ = calibrator_->getCalibrationTableAsString(); + } + + Status SerializeToString(string* serialized) { + calibrator_->waitAndSetDone(); + thr_->join(); + *serialized = calibration_table_; + if (serialized->empty()) { + return errors::Unknown("Calibration table is empty."); + } + return Status::OK(); + } + + // Lookup table for temporary staging areas of input tensors for calibration. + std::unordered_map> device_buffers_; + + // Temporary staging areas for calibration inputs. + std::vector device_tensors_; + + string calibration_table_; + std::unique_ptr calibrator_; + TrtUniquePtrType builder_; + TrtUniquePtrType engine_; + // TODO(sami): Use threadpool threads! + std::unique_ptr thr_; +}; + +ABSL_CONST_INIT extern const absl::string_view kCacheContainerName; + class TRTEngineCacheResource : public ResourceBase { public: // According to the TensorRT API, the logger is considered a singleton by the @@ -159,6 +194,10 @@ class TRTEngineCacheResource : public ResourceBase { LRUCache, std::unique_ptr, VectorTensorShapeHasher> cache_; + + // TODO(hinsu): Use different calibration context for the available shapes and + // attach it to each item of the cache. 
+ std::unique_ptr calib_ctx_; }; #endif // GOOGLE_TENSORRT diff --git a/tensorflow/python/compiler/tensorrt/trt_convert.py b/tensorflow/python/compiler/tensorrt/trt_convert.py index 982c4fea641..b11938aecc3 100644 --- a/tensorflow/python/compiler/tensorrt/trt_convert.py +++ b/tensorflow/python/compiler/tensorrt/trt_convert.py @@ -283,6 +283,19 @@ def get_tensorrt_rewriter_config( return rewriter_config_with_trt +# Remove all scope prefixes in the node name. In TF 2.0, the same concrete +# function can be initialized multiple times with different prefixes, and +# this will result in the same TRTEngineOp being initialized multiple times +# with different cache and duplicate TRT engines. +# TODO(laigd): this may be caused by the fact that TRTEngineOp is not +# stataful, need to investigate. +# TODO(laigd): we rely on the fact that all functions are fully inlined +# before TF-TRT optimizer is called, as otherwise it may generate the same +# name when optimizing a different function graph. Fix this. +def _get_canonical_engine_name(name): + return name.split("/")[-1] + + class TrtGraphConverter(object): """A converter for TF-TRT transformation for TF 1.x GraphDef/SavedModels. @@ -626,7 +639,9 @@ class TrtGraphConverter(object): # Get the calibration resource. calibration_result = calibration_sess.run( device_to_get_resource_op_map[node.device], - feed_dict={resource_name_input: node.name}) + feed_dict={ + resource_name_input: _get_canonical_engine_name(node.name) + }) node.attr["calibration_data"].s = calibration_result self._calibration_data_collected = True @@ -944,19 +959,9 @@ class TrtGraphConverterV2(object): canonical_engine_name, filename, self._conversion_params.maximum_cached_engines) - # Remove all scope prefixes in the node name. In TF 2.0, the same concrete - # function can be initialized multiple times with different prefixes, and - # this will result in the same TRTEngineOp being initialized multiple times - # with different cache and duplicate TRT engines. - # TODO(laigd): this may be caused by the fact that TRTEngineOp is not - # stataful, need to investigate. - # TODO(laigd): we rely on the fact that all functions are fully inlined - # before TF-TRT optimizer is called, as otherwise it may generate the same - # name when optimizing a different function graph. Fix this. 
- canonical_engine_name = lambda node: node.name.split("/")[-1] for node in self._converted_graph_def.node: if node.op == _TRT_ENGINE_OP_NAME: - _serialize_and_track_engine(canonical_engine_name(node)) + _serialize_and_track_engine(_get_canonical_engine_name(node.name)) for func in self._converted_graph_def.library.function: for node in func.node_def: if node.op == _TRT_ENGINE_OP_NAME: From 18a9074060e3e78495e20141effd96c5da732479 Mon Sep 17 00:00:00 2001 From: Trevor Morris Date: Fri, 19 Jul 2019 15:54:40 -0700 Subject: [PATCH 0199/3053] Improve code based on reviewer feedback --- tensorflow/python/keras/layers/core.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py index b21801786d9..117e2d9749c 100644 --- a/tensorflow/python/keras/layers/core.py +++ b/tensorflow/python/keras/layers/core.py @@ -580,9 +580,10 @@ class Flatten(Layer): permutation.append(1) inputs = array_ops.transpose(inputs, perm=permutation) - input_shape = tensor_shape.TensorShape(inputs.shape).as_list() - if input_shape and all(input_shape[1:]): - outputs = array_ops.reshape(inputs, (-1, int(np.prod(input_shape[1:])))) + input_shape = inputs.shape + if input_shape[1:].is_fully_defined(): + outputs = array_ops.reshape( + inputs, (-1, tensor_shape.dimension_value(np.prod(input_shape[1:])))) else: outputs = array_ops.reshape( inputs, (tensor_shape.dimension_value(inputs.shape[0]) or From 5f50f97e9e0872d114a6acafecd848159b10247c Mon Sep 17 00:00:00 2001 From: Juhyun Lee Date: Fri, 19 Jul 2019 16:01:27 -0700 Subject: [PATCH 0200/3053] TFLite GPU OpenGL: Introduce GeneratedCode.shared_variables. PiperOrigin-RevId: 259055618 --- tensorflow/lite/delegates/gpu/gl/kernels/add.cc | 4 ++++ tensorflow/lite/delegates/gpu/gl/kernels/concat.cc | 4 ++++ tensorflow/lite/delegates/gpu/gl/kernels/conv.cc | 2 ++ tensorflow/lite/delegates/gpu/gl/kernels/depthwise_conv.cc | 1 + tensorflow/lite/delegates/gpu/gl/kernels/elementwise.cc | 2 ++ tensorflow/lite/delegates/gpu/gl/kernels/fully_connected.cc | 1 + tensorflow/lite/delegates/gpu/gl/kernels/lstm.cc | 1 + tensorflow/lite/delegates/gpu/gl/kernels/max_unpooling.cc | 1 + tensorflow/lite/delegates/gpu/gl/kernels/mul.cc | 3 +++ tensorflow/lite/delegates/gpu/gl/kernels/pad.cc | 1 + tensorflow/lite/delegates/gpu/gl/kernels/pooling.cc | 2 ++ tensorflow/lite/delegates/gpu/gl/kernels/prelu.cc | 4 ++++ tensorflow/lite/delegates/gpu/gl/kernels/relu.cc | 1 + tensorflow/lite/delegates/gpu/gl/kernels/reshape.cc | 1 + tensorflow/lite/delegates/gpu/gl/kernels/slice.cc | 1 + tensorflow/lite/delegates/gpu/gl/kernels/softmax.cc | 1 + tensorflow/lite/delegates/gpu/gl/kernels/transpose_conv.cc | 1 + .../lite/delegates/gpu/gl/kernels/upsampling_bilinear.cc | 2 ++ tensorflow/lite/delegates/gpu/gl/node_shader.h | 3 +++ 19 files changed, 36 insertions(+) diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/add.cc b/tensorflow/lite/delegates/gpu/gl/kernels/add.cc index e1073299ecd..7c461e506f8 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/add.cc +++ b/tensorflow/lite/delegates/gpu/gl/kernels/add.cc @@ -50,6 +50,7 @@ class Add : public NodeShader { *generated_code = { /*parameters=*/{}, /*objects=*/{}, + /*shared_variables=*/{}, /*workload=*/uint3(), /*workgroup=*/uint3(), /*source_code=*/ @@ -72,6 +73,7 @@ class Add : public NodeShader { *generated_code = { /*parameters=*/{}, /*objects=*/{}, + /*shared_variables=*/{}, /*workload=*/uint3(), /*workgroup=*/uint3(), /*source_code=*/std::move(code), @@ -85,6 
+87,7 @@ class Add : public NodeShader { *generated_code = { /*parameters=*/{{"scalar", *scalar}}, /*objects=*/{}, + /*shared_variables=*/{}, /*workload=*/uint3(), /*workgroup=*/uint3(), /*source_code=*/"value_0 += $scalar$;", @@ -96,6 +99,7 @@ class Add : public NodeShader { *generated_code = { /*parameters=*/{}, /*objects=*/{{"add_buffer", MakeReadonlyObject(adds->data)}}, + /*shared_variables=*/{}, // Declare workload explicitly because shader depends on gid.z. /*workload=*/ uint3(shape.w, shape.h, IntegralDivideRoundUp(shape.c, 4)), diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/concat.cc b/tensorflow/lite/delegates/gpu/gl/kernels/concat.cc index c6cdb078a6d..a97d618e0b6 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/concat.cc +++ b/tensorflow/lite/delegates/gpu/gl/kernels/concat.cc @@ -87,6 +87,7 @@ class AlignedConcatByChannels : public NodeShader { *generated_code = { /*parameters=*/{{"border", inputs[0]->tensor.shape.c / 4}}, /*objects=*/{}, + /*shared_variables=*/{}, /*workload=*/uint3(), /*workgroup=*/uint3(), /*source_code=*/std::move(source), @@ -174,6 +175,7 @@ class ConcatByAnyChannel : public NodeShader { *generated_code = { /*parameters=*/{}, /*objects=*/{}, + /*shared_variables=*/{}, /*workload=*/uint3(output->tensor.shape.w, output->tensor.shape.h, 1), /*workgroup=*/uint3(), /*source_code=*/std::move(code), @@ -373,6 +375,7 @@ class FlatConcatByHeight : public NodeShader { *generated_code = { /*parameters=*/std::move(params), /*objects=*/{}, + /*shared_variables=*/{}, /*workload=*/uint3(), /*workgroup=*/uint3(), /*source_code=*/std::move(code), @@ -439,6 +442,7 @@ class FlatConcatByWidth : public NodeShader { *generated_code = { /*parameters=*/std::move(params), /*objects=*/{}, + /*shared_variables=*/{}, /*workload=*/uint3(), /*workgroup=*/uint3(), /*source_code=*/std::move(code), diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/conv.cc b/tensorflow/lite/delegates/gpu/gl/kernels/conv.cc index 0314b959e64..9a1c665f763 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/conv.cc +++ b/tensorflow/lite/delegates/gpu/gl/kernels/conv.cc @@ -105,6 +105,7 @@ class Convolution : public NodeShader { *generated_code = { /*parameters=*/std::move(parameters), /*objects=*/std::move(objects), + /*shared_variables=*/{}, /*workload=*/uint3(), /*workgroup=*/ GetIdealWorkgroupIfPossible( @@ -241,6 +242,7 @@ class Convolution1x1 : public NodeShader { *generated_code = { /*parameters=*/std::move(parameters), /*objects=*/std::move(objects), + /*shared_variables=*/{}, /*workload=*/ uint3(output->tensor.shape.w / multiplier, output->tensor.shape.h, IntegralDivideRoundUp(output->tensor.shape.c, 4)), diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/depthwise_conv.cc b/tensorflow/lite/delegates/gpu/gl/kernels/depthwise_conv.cc index c82723954b9..cc85211d178 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/depthwise_conv.cc +++ b/tensorflow/lite/delegates/gpu/gl/kernels/depthwise_conv.cc @@ -98,6 +98,7 @@ class DepthwiseConvolution : public NodeShader { *generated_code = { /*parameters=*/std::move(parameters), /*objects=*/std::move(objects), + /*shared_variables=*/{}, /*workload=*/uint3(), /*workgroup=*/ GetIdealWorkgroupIfPossible( diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/elementwise.cc b/tensorflow/lite/delegates/gpu/gl/kernels/elementwise.cc index 8ad2679e62e..fb4f0a512a5 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/elementwise.cc +++ b/tensorflow/lite/delegates/gpu/gl/kernels/elementwise.cc @@ -90,6 +90,7 @@ class ElementwiseOneArgument : 
public NodeShader { *generated_code = { /*parameters=*/{}, /*objects=*/{}, + /*shared_variables=*/{}, /*workload=*/uint3(), /*workgroup=*/uint3(), source, @@ -160,6 +161,7 @@ class ElementwiseTwoArguments : public NodeShader { *generated_code = { /*parameters=*/{}, /*objects=*/{}, + /*shared_variables=*/{}, /*workload=*/uint3(), /*workgroup=*/uint3(), /*source_code=*/source, diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/fully_connected.cc b/tensorflow/lite/delegates/gpu/gl/kernels/fully_connected.cc index f6c7526b5eb..bef337f9d24 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/fully_connected.cc +++ b/tensorflow/lite/delegates/gpu/gl/kernels/fully_connected.cc @@ -65,6 +65,7 @@ class FullyConnectedBuffers : public NodeShader { *generated_code = { /*parameters=*/std::move(parameters), /*objects=*/std::move(objects), + /*shared_variables=*/{}, /*workload=*/ uint3(1, 1, IntegralDivideRoundUp(attr.weights.shape.o, 4)), /*workgroup=*/uint3(), diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/lstm.cc b/tensorflow/lite/delegates/gpu/gl/kernels/lstm.cc index 696d5257598..e248cdfb31a 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/lstm.cc +++ b/tensorflow/lite/delegates/gpu/gl/kernels/lstm.cc @@ -73,6 +73,7 @@ class LstmNodeShader : public NodeShader { *generated_code = { /*parameters=*/{}, /*objects=*/{}, + /*shared_variables=*/{}, /*workload=*/uint3(), /*workgroup=*/uint3(), /*source_code=*/std::move(code), diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/max_unpooling.cc b/tensorflow/lite/delegates/gpu/gl/kernels/max_unpooling.cc index fd9302cb00c..2e977625489 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/max_unpooling.cc +++ b/tensorflow/lite/delegates/gpu/gl/kernels/max_unpooling.cc @@ -59,6 +59,7 @@ class MaxUnpooling : public NodeShader { *generated_code = { /*parameters=*/std::move(parameters), /*objects=*/{}, + /*shared_variables=*/{}, /*workload=*/uint3(), /*workgroup=*/uint3(), /*source_code=*/std::move(source), diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/mul.cc b/tensorflow/lite/delegates/gpu/gl/kernels/mul.cc index f57eaa70578..542b64ec2b3 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/mul.cc +++ b/tensorflow/lite/delegates/gpu/gl/kernels/mul.cc @@ -76,6 +76,7 @@ class ApplyMask : public NodeShader { *generated_code = { /*parameters=*/{}, /*objects=*/{}, + /*shared_variables=*/{}, /*workload=*/uint3(), /*workgroup=*/uint3(), /*source_code=*/std::move(source), @@ -99,6 +100,7 @@ class MultiplyScalar : public NodeShader { *generated_code = { /*parameters=*/{{"scalar", *scalar}}, /*objects=*/{}, + /*shared_variables=*/{}, /*workload=*/uint3(), /*workgroup=*/uint3(), /*source_code=*/"value_0 *= $scalar$;", @@ -113,6 +115,7 @@ class MultiplyScalar : public NodeShader { *generated_code = { /*parameters=*/{}, /*objects=*/{{"mul_buffer", MakeReadonlyObject(muls->data)}}, + /*shared_variables=*/{}, // Declare workload explicitly because shader depends on gid.z. 
/*workload=*/ uint3(shape.w, shape.h, IntegralDivideRoundUp(shape.c, 4)), diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/pad.cc b/tensorflow/lite/delegates/gpu/gl/kernels/pad.cc index a27835bbf36..a3a3ac75e60 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/pad.cc +++ b/tensorflow/lite/delegates/gpu/gl/kernels/pad.cc @@ -72,6 +72,7 @@ class Pad : public NodeShader { *generated_code = { /*parameters=*/std::move(parameters), /*objects=*/{}, + /*shared_variables=*/{}, /*workload=*/uint3(), /*workgroup=*/uint3(), /*source_code=*/std::move(source), diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/pooling.cc b/tensorflow/lite/delegates/gpu/gl/kernels/pooling.cc index ace3e801c54..8f140c33fca 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/pooling.cc +++ b/tensorflow/lite/delegates/gpu/gl/kernels/pooling.cc @@ -87,6 +87,7 @@ Status GenerateMaxPoolingCode(const Pooling2DAttributes& attr, *generated_code = { /*parameters=*/std::move(parameters), /*objects=*/{}, + /*shared_variables=*/{}, /*workload=*/uint3(), /*workgroup=*/uint3(), /*source_code=*/std::move(source), @@ -128,6 +129,7 @@ Status GenerateAveragePoolingCode(const Pooling2DAttributes& attr, *generated_code = { /*parameters=*/std::move(parameters), /*objects=*/{}, + /*shared_variables=*/{}, /*workload=*/uint3(), /*workgroup=*/uint3(), /*source_code=*/std::move(source), diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/prelu.cc b/tensorflow/lite/delegates/gpu/gl/kernels/prelu.cc index 0662fcf8907..80df527ffa4 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/prelu.cc +++ b/tensorflow/lite/delegates/gpu/gl/kernels/prelu.cc @@ -56,6 +56,7 @@ class PReLULinearAlpha : public NodeShader { ? GeneratedCode{ /*parameters=*/{{"clip", attr.clip}}, /*objects=*/{{"alpha", MakeReadonlyObject(alpha->data)}}, + /*shared_variables=*/{}, /*workload=*/uint3(), /*workgroup=*/uint3(), "value_0 = clamp(value_0, 0.0, $clip$) + $alpha[gid.z]$ * " @@ -66,6 +67,7 @@ class PReLULinearAlpha : public NodeShader { : GeneratedCode{ /*parameters=*/{}, /*objects=*/{{"alpha", MakeReadonlyObject(alpha->data)}}, + /*shared_variables=*/{}, // Declare workload explicitly because shader depends on // gid.z. /*workload=*/ @@ -109,6 +111,7 @@ class PReLUFull : public NodeShader { /*objects=*/ {{"alpha", MakeReadonlyObject(obj_size, ConvertToPHWC4(*alpha))}}, + /*shared_variables=*/{}, // Declare workload explicitly because shader // depends on gid.z. /*workload=*/ @@ -125,6 +128,7 @@ class PReLUFull : public NodeShader { /*objects=*/ {{"alpha", MakeReadonlyObject(obj_size, ConvertToPHWC4(*alpha))}}, + /*shared_variables=*/{}, // Declare workload explicitly because shader depends on // gid.z. 
/*workload=*/ diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/relu.cc b/tensorflow/lite/delegates/gpu/gl/kernels/relu.cc index aa5c6e855bc..a8e006ed151 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/relu.cc +++ b/tensorflow/lite/delegates/gpu/gl/kernels/relu.cc @@ -55,6 +55,7 @@ class ReLU : public NodeShader { *generated_code = { /*parameters=*/std::move(params), /*objects=*/{}, + /*shared_variables=*/{}, /*workload=*/uint3(), /*workgroup=*/uint3(), /*source_code=*/std::move(code), diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/reshape.cc b/tensorflow/lite/delegates/gpu/gl/kernels/reshape.cc index f2c0dc50e0b..5a0b6d7e3c3 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/reshape.cc +++ b/tensorflow/lite/delegates/gpu/gl/kernels/reshape.cc @@ -76,6 +76,7 @@ class Reshape : public NodeShader { {"output_channels", output->tensor.shape.c}, }, /*objects=*/{}, + /*shared_variables=*/{}, /*workload=*/uint3(), /*workgroup=*/uint3(), /*source_code=*/std::move(code), diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/slice.cc b/tensorflow/lite/delegates/gpu/gl/kernels/slice.cc index 678aa7a00ee..d0fe1923d4e 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/slice.cc +++ b/tensorflow/lite/delegates/gpu/gl/kernels/slice.cc @@ -100,6 +100,7 @@ class Slice : public NodeShader { *generated_code = { /*parameters=*/std::move(parameters), /*objects=*/{}, + /*shared_variables=*/{}, /*workload=*/uint3(), /*workgroup=*/uint3(), /*source_code=*/std::move(code), diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/softmax.cc b/tensorflow/lite/delegates/gpu/gl/kernels/softmax.cc index 04c80937676..9067ec956c5 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/softmax.cc +++ b/tensorflow/lite/delegates/gpu/gl/kernels/softmax.cc @@ -76,6 +76,7 @@ class SoftMax : public NodeShader { *generated_code = { /*parameters=*/std::move(parameters), /*objects=*/{}, + /*shared_variables=*/{}, /*workload=*/uint3(output->tensor.shape.w, output->tensor.shape.h, 1), /*workgroup=*/uint3(), /*source_code=*/std::move(source), diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/transpose_conv.cc b/tensorflow/lite/delegates/gpu/gl/kernels/transpose_conv.cc index 4682765421a..b9ecd09202b 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/transpose_conv.cc +++ b/tensorflow/lite/delegates/gpu/gl/kernels/transpose_conv.cc @@ -95,6 +95,7 @@ class ConvolutionTransposedBuffers : public NodeShader { *generated_code = { /*parameters=*/std::move(parameters), /*objects=*/std::move(objects), + /*shared_variables=*/{}, /*workload=*/uint3(), /*workgroup=*/uint3(), /*source_code=*/source, diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/upsampling_bilinear.cc b/tensorflow/lite/delegates/gpu/gl/kernels/upsampling_bilinear.cc index a30e5ad8e17..96708db84a8 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/upsampling_bilinear.cc +++ b/tensorflow/lite/delegates/gpu/gl/kernels/upsampling_bilinear.cc @@ -62,6 +62,7 @@ class UpsamplingBilinear : public NodeShader { *generated_code = { /*parameters=*/{}, /*objects=*/{}, + /*shared_variables=*/{}, /*workload=*/uint3(), /*workgroup=*/uint3(), /*source_code=*/"value_0 = $input_data_0[0, 0, gid.z]$;", @@ -100,6 +101,7 @@ class UpsamplingBilinear : public NodeShader { *generated_code = { /*parameters=*/std::move(parameters), /*objects=*/{}, + /*shared_variables=*/{}, /*workload=*/uint3(), /*workgroup=*/uint3(), /*source_code=*/std::move(source), diff --git a/tensorflow/lite/delegates/gpu/gl/node_shader.h b/tensorflow/lite/delegates/gpu/gl/node_shader.h index 
710d4b6d5e8..310719e23c9 100644 --- a/tensorflow/lite/delegates/gpu/gl/node_shader.h +++ b/tensorflow/lite/delegates/gpu/gl/node_shader.h @@ -63,6 +63,9 @@ struct GeneratedCode { // A list of objects to bind before shader could be executed. std::vector> objects; + // A list of shared variables in the shader program. + std::vector shared_variables; + // Compute shader operate on an abstract concept of work groups, each // three-dimensional. The number of work groups to be executed is defined by // workload tuple. Therefore, From 77e0f48970568b7a55b5c280727302e734376010 Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Fri, 19 Jul 2019 16:20:38 -0700 Subject: [PATCH 0201/3053] [XLA GPU] Very minor cleanups in ir_emitter_unnested PiperOrigin-RevId: 259058752 --- .../xla/service/gpu/ir_emitter_unnested.cc | 45 +++++++++---------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 168156edf8e..de7fab3304e 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2600,8 +2600,7 @@ void IrEmitterUnnested::EmitPrologueForOneReduction( HloInstruction* unnested_hlo, HloInstruction* reduce_inst, int reduce_idx, KernelCodegenInfo* kernel_info, GpuElementalIrEmitter* elemental_emitter, ShapeIndex output_shape_index) { - ReductionCodegenInfo* reduction_info = - static_cast(kernel_info); + auto reduction_info = static_cast(kernel_info); InlinedVector* reducers = reduction_info->GetMutableReducers(); @@ -2660,8 +2659,7 @@ void IrEmitterUnnested::EmitPrologueForReduction( : unnested_hlo; absl::Span output_instructions = GetOutputInstructions(&reduce_or_tuple); - ReductionCodegenInfo* reduction_info = - static_cast(kernel_info); + auto reduction_info = static_cast(kernel_info); GpuElementalIrEmitter elemental_emitter(hlo_module_config_, ir_emitter_context_->llvm_module(), &b_, GetNestedComputer()); @@ -2734,8 +2732,7 @@ void IrEmitterUnnested::EmitFullWarpShuffleDownLoopForAllReduces( void IrEmitterUnnested::EmitEpilogueForReduction( HloInstruction* unnested_hlo, KernelCodegenInfo* kernel_info) { - ReductionCodegenInfo* reduction_info = - static_cast(kernel_info); + auto reduction_info = static_cast(kernel_info); int num_reduces = reduction_info->GetNumberOfReduces(); absl::Span partial_result_addresses = reduction_info->GetPartialResultAddresses(); @@ -2850,8 +2847,7 @@ void IrEmitterUnnested::EmitTileElementForReduction( tiled_param_info->set_x(x_loc); // Record the untransposed output linear address for the reduction. - const ReductionCodegenInfo* reduction_info = - dynamic_cast(kernel_info); + auto reduction_info = dynamic_cast(kernel_info); int partial_result_index = reduction_info->IsRowReduction() ? 0 : x_iter_num; Store(reduction_info->GetUntransposedOutputLinearAddress(&b_, index), InBoundsGEP(reduction_info->GetCurrentOutputLinearIndexAddress(), @@ -3036,11 +3032,15 @@ void IrEmitterUnnested::EmitBlock(const TileGenerator& emit_one_tile, // Emits a kernel for the hlo instruction using the given kernel mapping scheme. // +// The emitted code is written into the member variable b_, which corresponds to +// the kernel thunk currently being constructed (previous call to +// BuildKernelThunk). +// // unnested_hlo: The unnested hlo instruction for which the kernel is generated. // Currently, these hlo instructions are supported: kLoop fusion, kCopy. 
// tiled_param_ids: The IDs for the parameters that are 0-2-1 transpose of // other tensors with the same dimensions and are safe to be tranposed via -// the shared memory tranpose implementation. +// the shared memory transpose implementation. // mapping_scheme: The tiling scheme to use. // kernel_generator: Contains function objects for code generation, such as // element generator, block prologue and epilogue generators. @@ -3067,14 +3067,12 @@ LaunchDimensions IrEmitterUnnested::EmitKernel( << llvm_ir::DumpToString(*param_shmem_buffers[id]); } - const ReductionCodegenInfo* reduction_info = - dynamic_cast(kernel_info); + auto reduction_info = dynamic_cast(kernel_info); bool is_column_reduction = (reduction_info && !reduction_info->IsRowReduction()); - LaunchDimensions launch_dimensions = - LaunchDimensions(mapping_scheme->GetNumberOfBlocks(), - mapping_scheme->GetThreadsPerBlock()); + LaunchDimensions launch_dimensions(mapping_scheme->GetNumberOfBlocks(), + mapping_scheme->GetThreadsPerBlock()); // TODO(b/110211620): Enable int32 index type for column reduction. llvm::Type* index_ty = @@ -3214,7 +3212,7 @@ LaunchDimensions IrEmitterUnnested::EmitKernel( // algorithm to improve the memory access patterns for the input parameters // with a shape that is a 0-2-1 transpose of the output tensor shape. The caller // is responsible for making sure that it is safe to apply the shared memory -// tranpose on the input parameters. +// transpose on the input parameters. // // // For the purpose of tiling, the output tensors have a logical shape of three @@ -3282,7 +3280,7 @@ namespace { // the preload tile. If this is not true, we can't use a shmem transpose for P. // // If the computation of output element [z, y, x] only requires the element of -// P with the same indices, the shmem tranpose implementation can be applied +// P with the same indices, the shmem transpose implementation can be applied // to P safely. This is a sufficient but not necessary condition. We check all // the transitive users of P to see if we can find a user that may cause an // exception to the situation. If such a user is not found, we conclude that P @@ -3302,7 +3300,7 @@ namespace { // block. // // TODO(bixia): In order to extend this for kInput fusion, that is reduction -// with tranpose, we only need to end the use-chain checking with the input of +// with transpose, we only need to end the use-chain checking with the input of // a reduce operations. In this case, the above description on "output" apply // to the result of such a use-chain, which provides the input to the reduce // operation. @@ -3334,9 +3332,9 @@ bool IsInstructionSafeForShmemTranspose(const HloInstruction* hlo) { } } -// Given a group of input parameters that are 0-2-1 tranpose of the outputs of +// Given a group of input parameters that are 0-2-1 transpose of the outputs of // a fusion kernel, returns the input parameters that are safe for the shared -// memory tranpose implementation. +// memory transpose implementation. 
// // When a tile based shared memory transpose is used to implement an input with // 0-2-1 transpose, we preload a tile of the input elements @@ -3354,8 +3352,7 @@ std::vector FilterInputsForShmemTranspose(const HloInstruction* fusion, if (IsInstructionSafeForShmemTranspose(input)) { filtered_input_ids.push_back(input_ids[i]); } else { - VLOG(10) << "Input not safe for shmem transpose " << input->ToString() - << "\n"; + VLOG(10) << "Input not safe for shmem transpose " << input->ToString(); } } return filtered_input_ids; @@ -3710,13 +3707,13 @@ Status IrEmitterUnnested::EmitReductionFromOrToContiguousDimensions( EmitEpilogueForReduction(hlo, kernel_info); }); - LaunchDimensions launch_dimensions = - EmitKernel(unnested_hlo, {}, kernel_generator, &reduction_info); + LaunchDimensions launch_dimensions = EmitKernel( + unnested_hlo, /*param_ids=*/{}, kernel_generator, &reduction_info); UpdateLaunchDimensions(launch_dimensions, kernel_thunk.get(), ir_emitter_context_->llvm_module()); thunks.push_back(std::move(kernel_thunk)); - std::unique_ptr sequential_thunk = + auto sequential_thunk = absl::make_unique(std::move(thunks), unnested_hlo); AddThunkToThunkSequence(std::move(sequential_thunk)); From 606da502cd2087b265abc70c1542a263d41c88dc Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Fri, 19 Jul 2019 16:28:49 -0700 Subject: [PATCH 0202/3053] Use a different CancellationManager for every execution of function/op The newly added benchmark shows about a 100ns loss on my machine. While not ideal, I believe the added time is justifiable since the loss is only in the op-by-op execution path (when a TF_CancellationManager is not active), and not on the function execution path (which has taken a back seat in the past). The previous behavior was also broken, so continuing to support it seems non-ideal. If this becomes a bottleneck in the future, we can probably explore ways of making it faster. PiperOrigin-RevId: 259059904 --- tensorflow/c/eager/c_api_test.cc | 32 ++++++++++++ tensorflow/c/eager/c_api_test_util.cc | 13 +++++ tensorflow/c/eager/c_api_test_util.h | 3 ++ .../common_runtime/eager/kernel_and_device.cc | 50 +++++++++++-------- .../common_runtime/eager/kernel_and_device.h | 16 +----- tensorflow/core/framework/cancellation.cc | 6 --- tensorflow/core/framework/cancellation.h | 6 +-- tensorflow/python/distribute/BUILD | 1 + .../distribute/distribute_strategy_test.py | 2 - .../keras/engine/training_dataset_test.py | 9 ---- .../experimental/keras_test.py | 2 - 11 files changed, 81 insertions(+), 59 deletions(-) diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc index e80620c9a64..17df7bbaa06 100644 --- a/tensorflow/c/eager/c_api_test.cc +++ b/tensorflow/c/eager/c_api_test.cc @@ -89,6 +89,38 @@ void BM_Execute(int iters, int async) { } BENCHMARK(BM_Execute)->Arg(0)->Arg(1); +void BM_Execute_Identity(int iters, int async) { + tensorflow::testing::StopTiming(); + tensorflow::testing::SetLabel(async ? 
"ExecuteIdentityAsync" + : "ExecuteIdentity"); + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_ContextOptionsSetAsync(opts, static_cast(async)); + TFE_Context* ctx = TFE_NewContext(opts, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_TensorHandle* m = TestMatrixTensorHandle(); + TFE_Op* identity = IdentityOp(ctx, m); + TFE_TensorHandle* retvals[1]; + int num_retvals = 1; + tensorflow::testing::StartTiming(); + for (int i = 0; i < iters; ++i) { + TFE_Execute(identity, &retvals[0], &num_retvals, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + } + if (async) { + TFE_ContextAsyncWait(ctx, status); + } + tensorflow::testing::StopTiming(); + TFE_DeleteOp(identity); + TFE_DeleteTensorHandle(m); + TFE_DeleteContext(ctx); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TF_DeleteStatus(status); +} +BENCHMARK(BM_Execute_Identity)->Arg(0)->Arg(1); + TEST(CAPI, Context) { TF_Status* status = TF_NewStatus(); TFE_ContextOptions* opts = TFE_NewContextOptions(); diff --git a/tensorflow/c/eager/c_api_test_util.cc b/tensorflow/c/eager/c_api_test_util.cc index 17d17c0b7f7..10d95e61451 100644 --- a/tensorflow/c/eager/c_api_test_util.cc +++ b/tensorflow/c/eager/c_api_test_util.cc @@ -128,6 +128,19 @@ TFE_Op* MatMulOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b) { return op; } +TFE_Op* IdentityOp(TFE_Context* ctx, TFE_TensorHandle* a) { + TF_Status* status = TF_NewStatus(); + + TFE_Op* op = TFE_NewOp(ctx, "Identity", status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpAddInput(op, a, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TF_DeleteStatus(status); + TFE_OpSetAttrType(op, "T", TFE_TensorHandleDataType(a)); + + return op; +} + TFE_Op* ShapeOp(TFE_Context* ctx, TFE_TensorHandle* a) { TF_Status* status = TF_NewStatus(); diff --git a/tensorflow/c/eager/c_api_test_util.h b/tensorflow/c/eager/c_api_test_util.h index 4ff3ff4301f..d0c20ac3743 100644 --- a/tensorflow/c/eager/c_api_test_util.h +++ b/tensorflow/c/eager/c_api_test_util.h @@ -43,6 +43,9 @@ TFE_TensorHandle* TestMatrixTensorHandle3X2(); // Return a matmul op multiplying `a` by `b`. TFE_Op* MatMulOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b); +// Return an identity op. +TFE_Op* IdentityOp(TFE_Context* ctx, TFE_TensorHandle* a); + // Return a shape op fetching the shape of `a`. TFE_Op* ShapeOp(TFE_Context* ctx, TFE_TensorHandle* a); diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.cc b/tensorflow/core/common_runtime/eager/kernel_and_device.cc index 432278486eb..eb7b1b7eb23 100644 --- a/tensorflow/core/common_runtime/eager/kernel_and_device.cc +++ b/tensorflow/core/common_runtime/eager/kernel_and_device.cc @@ -31,6 +31,7 @@ limitations under the License. #include "tensorflow/core/framework/types.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/refcount.h" #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/lib/gtl/stl_util.h" #include "tensorflow/core/lib/random/random.h" @@ -69,16 +70,6 @@ KernelAndDeviceFunc::~KernelAndDeviceFunc() { } } -KernelAndDeviceOp::~KernelAndDeviceOp() { - // Make sure that the device execution has finished before deleting cm_. 
- { - mutex_lock lock(num_deferred_ops_mu_); - while (num_deferred_ops_ > 0) { - no_deferred_ops_cv_.wait(lock); - } - } -} - Status KernelAndDeviceOp::Init(const NodeDef& ndef, GraphCollector* graph_collector) { OpKernel* k = nullptr; @@ -230,6 +221,15 @@ void UpdateStats(OpKernelContext* context, ms->set_persistent_memory_size(context->persistent_memory_allocated()); step_stats_collector->Finalize(); } + +// In certain contexts (e.g. TPU async executions), the CancellationManager is +// used to shut down the device in error scenarios (as opposed to using the +// AsyncCompute's DoneCallback). This is handled through the +// {inc,dec}_num_deferred_ops_function. +struct OpExecutionState : public core::RefCounted { + // TODO(nareshmodi): consider refcounting the cancellation_manager. + CancellationManager cancellation_manager; +}; } // anonymous namespace Status KernelAndDeviceOp::Run(ScopedStepContainer* step_container, @@ -269,22 +269,22 @@ Status KernelAndDeviceOp::Run(ScopedStepContainer* step_container, params.function_library = flr_; params.slice_reader_cache = &slice_reader_cache_; params.rendezvous = rendez_; + OpExecutionState* op_execution_state = nullptr; if (cancellation_manager) { params.cancellation_manager = cancellation_manager; } else { - params.cancellation_manager = &cm_; - cm_.Reset(); + op_execution_state = new OpExecutionState; + params.cancellation_manager = &op_execution_state->cancellation_manager; } params.log_memory = log_memory_; - params.inc_num_deferred_ops_function = [this]() { - mutex_lock lock(num_deferred_ops_mu_); - num_deferred_ops_++; + params.inc_num_deferred_ops_function = [op_execution_state]() { + if (op_execution_state != nullptr) { + op_execution_state->Ref(); + } }; - params.dec_num_deferred_ops_function = [this]() { - mutex_lock lock(num_deferred_ops_mu_); - num_deferred_ops_--; - if (num_deferred_ops_ == 0) { - no_deferred_ops_cv_.notify_all(); + params.dec_num_deferred_ops_function = [op_execution_state]() { + if (op_execution_state != nullptr) { + op_execution_state->Unref(); } }; std::unique_ptr step_stats_collector; @@ -340,6 +340,12 @@ Status KernelAndDeviceOp::Run(ScopedStepContainer* step_container, device_->Compute(kernel_.get(), &context); } } + + // Clean up execution op_execution_state if deferred ops aren't running. + if (op_execution_state != nullptr) { + op_execution_state->Unref(); + } + if (!context.status().ok()) return context.status(); if (outputs != nullptr) { @@ -369,11 +375,11 @@ Status KernelAndDeviceFunc::Run( opts.rendezvous = rendezvous; opts.create_rendezvous = false; + CancellationManager cm; if (cancellation_manager) { opts.cancellation_manager = cancellation_manager; } else { - opts.cancellation_manager = &cm_; - cm_.Reset(); + opts.cancellation_manager = &cm; } opts.allow_dead_tensors = true; opts.step_container = step_container; diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.h b/tensorflow/core/common_runtime/eager/kernel_and_device.h index 6ec085944ad..e40beb2279b 100644 --- a/tensorflow/core/common_runtime/eager/kernel_and_device.h +++ b/tensorflow/core/common_runtime/eager/kernel_and_device.h @@ -115,11 +115,6 @@ class KernelAndDevice : public core::RefCounted { protected: std::function)>* get_runner() const; - // TODO(apassos) Consider a shared cancellation manager. Note that this - // cancellation manager is not useful to actually cancel anything, and is - // provided here only for the few kernels which can't handle one being - // missing. 
- CancellationManager cm_; Device* const device_; // can be null Device* const host_cpu_device_; // non-null FunctionLibraryRuntime* const flr_; // can be null @@ -143,7 +138,7 @@ class KernelAndDeviceOp final : public KernelAndDevice { rendez_(rendez), log_memory_(log_memory) {} - virtual ~KernelAndDeviceOp(); + ~KernelAndDeviceOp() override {} Status Init(const NodeDef& ndef, GraphCollector* graph_collector) override; @@ -177,15 +172,6 @@ class KernelAndDeviceOp final : public KernelAndDevice { Rendezvous* const rendez_; checkpoint::TensorSliceReaderCacheWrapper slice_reader_cache_; const bool log_memory_; - - // For deferred ops, AsyncOpKernel::DoneCallback is called once the op is - // enqueued to device. The execution of the op may not finish when - // device_->Compute returns. We rely on no_deferred_ops_cv_ to know when the - // execution has finished. - // Available via OpKernelContext to every OpKernel invocation. - mutex num_deferred_ops_mu_; - condition_variable no_deferred_ops_cv_; - int64 num_deferred_ops_ GUARDED_BY(num_deferred_ops_mu_) = 0; }; // Represents a multi-device function. Functions can also be run using diff --git a/tensorflow/core/framework/cancellation.cc b/tensorflow/core/framework/cancellation.cc index 7f639b5ca9a..af59500aee3 100644 --- a/tensorflow/core/framework/cancellation.cc +++ b/tensorflow/core/framework/cancellation.cc @@ -27,12 +27,6 @@ CancellationManager::CancellationManager() is_cancelled_(false), next_cancellation_token_(0) {} -void CancellationManager::Reset() { - mutex_lock l(mu_); - is_cancelling_ = false; - is_cancelled_.store(false); -} - void CancellationManager::StartCancel() { gtl::FlatMap callbacks_to_run; { diff --git a/tensorflow/core/framework/cancellation.h b/tensorflow/core/framework/cancellation.h index 51b200423ec..d1172ca82ed 100644 --- a/tensorflow/core/framework/cancellation.h +++ b/tensorflow/core/framework/cancellation.h @@ -42,6 +42,9 @@ typedef int64 CancellationToken; // comment for CancellationManager::RegisterCallback. typedef std::function CancelCallback; +// This class should never simultaneously be used as the cancellation manager +// for two separate sets of executions (i.e two separate steps, or two separate +// function executions). class CancellationManager { public: // A value that won't be returned by get_cancellation_token(). @@ -56,9 +59,6 @@ class CancellationManager { // Returns true iff StartCancel() has been called. bool IsCancelled() { return is_cancelled_.load(std::memory_order_acquire); } - // Resets the cancellation manager to its original pre-cancelled state. - void Reset(); - // Returns a token that must be used in calls to RegisterCallback // and DeregisterCallback. 
CancellationToken get_cancellation_token(); diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index 6a9f63c290d..91edc480673 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -1119,6 +1119,7 @@ distribute_py_test( size = "medium", srcs = ["keras_experimental_saved_model_test.py"], main = "keras_experimental_saved_model_test.py", + shard_count = 5, tags = [ "no_oss", # TODO(b/135287893) reenable "no_rocm", diff --git a/tensorflow/python/keras/distribute/distribute_strategy_test.py b/tensorflow/python/keras/distribute/distribute_strategy_test.py index f20fa0b1144..9592b299c87 100644 --- a/tensorflow/python/keras/distribute/distribute_strategy_test.py +++ b/tensorflow/python/keras/distribute/distribute_strategy_test.py @@ -888,8 +888,6 @@ class TestDistributionStrategyWithDatasets(test.TestCase, combinations.combine(run_distributed=[True, False]))) def test_on_dataset_with_unknown_cardinality_without_steps( self, distribution, run_distributed, mode): - if mode == 'eager': - self.skipTest('b/137776821 : Fails with -c opt=-undebug') with self.cached_session(): with distribution.scope(): optimizer_fn = gradient_descent_keras.SGD diff --git a/tensorflow/python/keras/engine/training_dataset_test.py b/tensorflow/python/keras/engine/training_dataset_test.py index b10ea854141..cd3613198fd 100644 --- a/tensorflow/python/keras/engine/training_dataset_test.py +++ b/tensorflow/python/keras/engine/training_dataset_test.py @@ -385,9 +385,6 @@ class TestTrainingWithDataset(keras_parameterized.TestCase): @keras_parameterized.run_all_keras_modes def test_dataset_fit_correctness(self): - if testing_utils.should_run_distributed(): - self.skipTest('b/137776821 : Fails with -c opt=-undebug') - class SumLayer(keras.layers.Layer): def build(self, _): @@ -467,8 +464,6 @@ class TestTrainingWithDataset(keras_parameterized.TestCase): @keras_parameterized.run_with_all_model_types @keras_parameterized.run_all_keras_modes def test_finite_dataset_known_cardinality_no_steps_arg(self): - if testing_utils.should_run_distributed(): - self.skipTest('b/137776821 : Fails with -c opt=-undebug') model = testing_utils.get_small_mlp(1, 4, input_dim=3) model.compile( 'rmsprop', @@ -493,8 +488,6 @@ class TestTrainingWithDataset(keras_parameterized.TestCase): @keras_parameterized.run_with_all_model_types @keras_parameterized.run_all_keras_modes def test_finite_dataset_unknown_cardinality_no_steps_arg(self): - if testing_utils.should_run_distributed(): - self.skipTest('b/137776821 : Fails with -c opt=-undebug') model = testing_utils.get_small_mlp(1, 4, input_dim=3) model.compile( 'rmsprop', @@ -521,8 +514,6 @@ class TestTrainingWithDataset(keras_parameterized.TestCase): @keras_parameterized.run_with_all_model_types @keras_parameterized.run_all_keras_modes(always_skip_v1=True) def test_finite_dataset_unknown_cardinality_no_step_with_train_and_val(self): - if testing_utils.should_run_distributed(): - self.skipTest('b/137776821 : Fails with -c opt=-undebug') class CaptureStdout(object): diff --git a/tensorflow/python/keras/mixed_precision/experimental/keras_test.py b/tensorflow/python/keras/mixed_precision/experimental/keras_test.py index d1471b7da0f..ca07a65b9f0 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/keras_test.py +++ b/tensorflow/python/keras/mixed_precision/experimental/keras_test.py @@ -579,8 +579,6 @@ class KerasModelTest(keras_parameterized.TestCase): def test_dynamic_loss_scaling(self, strategy_fn, run_distributed=True): if not 
self._is_strategy_supported(strategy_fn): return - if run_distributed: - self.skipTest('b/137776821 : Fails with -c opt=-undebug') strategy = strategy_fn() initial_loss_scale = 2. batch_size = 4 From 7d13706efc6663d99aabf1f2cc77aa5db86b3e81 Mon Sep 17 00:00:00 2001 From: Yanan Cao Date: Fri, 19 Jul 2019 16:42:26 -0700 Subject: [PATCH 0203/3053] disabling test internally PiperOrigin-RevId: 259062044 --- tensorflow/python/distribute/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index 91edc480673..79d3b126806 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -1123,6 +1123,7 @@ distribute_py_test( tags = [ "no_oss", # TODO(b/135287893) reenable "no_rocm", + "notap", # TODO(b/137972256) Re-enable this test. ], deps = [ ":saved_model_test_base", From 425177418fb63a3d6345d5174ffd507fb4f2729b Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Fri, 19 Jul 2019 16:44:35 -0700 Subject: [PATCH 0204/3053] In case we don't allow dynamic tensor then plan allocation for prepared ops and return failure to modify the graph with the passed delegate. PiperOrigin-RevId: 259062395 --- tensorflow/lite/core/subgraph.cc | 26 ++++++++++------- tensorflow/lite/core/subgraph.h | 3 ++ .../delegates/nnapi/nnapi_delegate_test.cc | 29 +++++++++++++++++++ 3 files changed, 47 insertions(+), 11 deletions(-) diff --git a/tensorflow/lite/core/subgraph.cc b/tensorflow/lite/core/subgraph.cc index a5934270448..acbd41d19b8 100644 --- a/tensorflow/lite/core/subgraph.cc +++ b/tensorflow/lite/core/subgraph.cc @@ -1099,6 +1099,16 @@ TfLiteStatus Subgraph::RedoAllDelegates() { return kTfLiteOk; } +TfLiteStatus Subgraph::EnsureMemoryAllocations() { + if (memory_planner_) { + state_ = kStateUninvokable; + TF_LITE_ENSURE_OK(&context_, memory_planner_->PlanAllocations()); + } + TF_LITE_ENSURE_OK(&context_, AllocateTensors()); + TF_LITE_ENSURE_EQ(&context_, state_, kStateInvokable); + return kTfLiteOk; +} + TfLiteStatus Subgraph::ModifyGraphWithDelegate(TfLiteDelegate* delegate) { // Restore delegation state if applicable. TF_LITE_ENSURE_STATUS(RedoAllDelegates()); @@ -1114,6 +1124,9 @@ TfLiteStatus Subgraph::ModifyGraphWithDelegate(TfLiteDelegate* delegate) { TF_LITE_ENSURE_OK(&context_, PrepareOpsStartingAt( 0, &last_execution_plan_index_prepared)); if (has_dynamic_tensors_) { + // Make sure that we are in a defined ready state before returning. + // Plan and allocate tensors before returning. + TF_LITE_ENSURE_OK(&context_, EnsureMemoryAllocations()); ReportError( "Attempting to use a delegate that only supports static-sized " "tensors with a graph that has dynamic-sized tensors."); @@ -1141,26 +1154,17 @@ TfLiteStatus Subgraph::ModifyGraphWithDelegate(TfLiteDelegate* delegate) { TF_LITE_ENSURE_OK(&context_, status); - // If the memory planner has already been created, we need to execute - // planning again to account for the updated graph topology. - if (memory_planner_) { - state_ = kStateUninvokable; - TF_LITE_ENSURE_OK(&context_, memory_planner_->PlanAllocations()); - } - if (!(delegate->flags & kTfLiteDelegateFlagsAllowDynamicTensors)) { // Reset the state to force tensor/op reallocation. state_ = kStateUninvokable; - TF_LITE_ENSURE_OK(&context_, AllocateTensors()); - TF_LITE_ENSURE_EQ(&context_, state_, kStateInvokable); + TF_LITE_ENSURE_OK(&context_, EnsureMemoryAllocations()); // After using a delegate which doesn't support dynamic tensors, make the // entire graph immutable. 
state_ = kStateInvokableAndImmutable; } else if (was_invokable_before_delegate) { // If the graph was invokable prior to delegate application, flush // allocation now to leave it in a consistent state. - TF_LITE_ENSURE_OK(&context_, AllocateTensors()); - TF_LITE_ENSURE_EQ(&context_, state_, kStateInvokable); + TF_LITE_ENSURE_OK(&context_, EnsureMemoryAllocations()); } delegates_applied_.push_back(delegate); diff --git a/tensorflow/lite/core/subgraph.h b/tensorflow/lite/core/subgraph.h index 7776f90429e..0a6bb634cfd 100644 --- a/tensorflow/lite/core/subgraph.h +++ b/tensorflow/lite/core/subgraph.h @@ -457,6 +457,9 @@ class Subgraph { } } + // Ensures the memory required is planned and allocated. + TfLiteStatus EnsureMemoryAllocations(); + // The state of the Interpreter. enum State { // The interpreter isn't ready to be invoked. diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc index c8e9e00d86a..dbbe2124f96 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc @@ -4423,6 +4423,35 @@ class BaseReduceOpModel : public SingleOpModelWithNNAPI { int output_; }; +// Model for the tests case where axis is a dynamic tensor. +class MeanOpDynamicModel : public BaseReduceOpModel { + public: + MeanOpDynamicModel(const TensorData& input, const TensorData& output, + const TensorData& axis, bool keep_dims) { + input_ = AddInput(input); + axis_ = AddInput(axis); + output_ = AddOutput(output); + SetBuiltinOp(BuiltinOperator_MEAN, BuiltinOptions_ReducerOptions, + CreateReducerOptions(builder_, keep_dims).Union()); + BuildInterpreter({GetShape(input_)}); + } +}; + +TEST(DynamicFloatMeanOpTest, NotKeepDims) { + std::vector data = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, + 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, + 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0}; + MeanOpDynamicModel m({TensorType_FLOAT32, {4, 3, 2}}, + {TensorType_FLOAT32, {2}}, {TensorType_INT32, {4}}, + false); + std::vector axis = {1, 0, -3, -3}; + m.SetAxis(axis); + m.SetInput(data); + m.Invoke(); + EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({12, 13}))); +} + // Model for the tests case where axis is a const tensor. class MeanOpConstModel : public BaseReduceOpModel { public: From 6c2cea380e29270d98c8072725d93cdcdaa5d820 Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Fri, 19 Jul 2019 16:46:46 -0700 Subject: [PATCH 0205/3053] [XLA] Add a missing dependency to a local_client library PiperOrigin-RevId: 259062728 --- tensorflow/compiler/xla/python/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD index a6a1bd1830e..0e66e99faeb 100644 --- a/tensorflow/compiler/xla/python/BUILD +++ b/tensorflow/compiler/xla/python/BUILD @@ -222,6 +222,7 @@ cc_library( "//tensorflow/compiler/xla/service:computation_placer", "//tensorflow/compiler/xla/service:platform_util", "//tensorflow/compiler/xla/service:shaped_buffer", + "//tensorflow/core:allocator", "//tensorflow/core:bfc_allocator", "//tensorflow/core:gpu_mem_allocator", "//tensorflow/core:lib", From 076b7765993026c8ac405dfb1e79b43fd73eb5f3 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Fri, 19 Jul 2019 16:46:52 -0700 Subject: [PATCH 0206/3053] Make use of the CreateBitcast method. This migrates a few instances from CreateUnary to CreateBitcast. 
Now that we have a dedicated method, it makes sense to use it. PiperOrigin-RevId: 259062744 --- .../compiler/xla/service/algebraic_simplifier.cc | 9 ++++----- .../compiler/xla/service/buffer_assignment_test.cc | 2 +- .../compiler/xla/service/copy_insertion_test.cc | 13 +++++++------ .../compiler/xla/service/hlo_alias_analysis_test.cc | 8 ++++---- .../xla/service/hlo_dataflow_analysis_test.cc | 4 ++-- .../compiler/xla/service/layout_assignment_test.cc | 4 ++-- .../xla/service/tuple_points_to_analysis_test.cc | 4 ++-- 7 files changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index eef570e2540..2025cb0f724 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -431,8 +431,8 @@ void AlgebraicSimplifierVisitor::ReplaceWithBitcast(HloInstruction* instruction, CHECK_EQ(ShapeUtil::ByteSizeOf(instruction->shape()), ShapeUtil::ByteSizeOf(operand->shape())); - auto bitcast = computation_->AddInstruction(HloInstruction::CreateUnary( - instruction->shape(), HloOpcode::kBitcast, operand)); + auto bitcast = computation_->AddInstruction( + HloInstruction::CreateBitcast(instruction->shape(), operand)); TF_CHECK_OK(ReplaceInstruction(instruction, bitcast)); } @@ -573,8 +573,7 @@ Status AlgebraicSimplifierVisitor::HandleBitcast(HloInstruction* bitcast) { HloInstruction* op; if (Match(bitcast, m::Bitcast(m::Bitcast(m::Op(&op))))) { return ReplaceWithNewInstruction( - bitcast, - HloInstruction::CreateUnary(bitcast->shape(), HloOpcode::kBitcast, op)); + bitcast, HloInstruction::CreateBitcast(bitcast->shape(), op)); } // All bitcasts can be eliminated (assuming layout constraints are // satisified). @@ -3807,7 +3806,7 @@ StatusOr AlgebraicSimplifierVisitor::SimplifyConvToDot( std::vector dims(operand->shape().dimensions_size()); std::iota(dims.begin(), dims.end(), 0); return computation_->AddInstruction( - HloInstruction::CreateUnary(shape, HloOpcode::kBitcast, operand)); + HloInstruction::CreateBitcast(shape, operand)); }; // Replace it with a dot, with bitcasts around it to get the right shape. 
diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc index 3bb98d5d1be..1ca20b6b4f5 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc @@ -1482,7 +1482,7 @@ TEST_F(BufferAssignmentTest, BitcastAsOutput) { auto param = builder.AddInstruction(HloInstruction::CreateParameter( 0, ShapeUtil::MakeShape(F32, {42}), "param")); auto bitcast = builder.AddInstruction( - HloInstruction::CreateUnary(param->shape(), HloOpcode::kBitcast, param)); + HloInstruction::CreateBitcast(param->shape(), param)); auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); diff --git a/tensorflow/compiler/xla/service/copy_insertion_test.cc b/tensorflow/compiler/xla/service/copy_insertion_test.cc index 6fa3161e578..f0ac579a387 100644 --- a/tensorflow/compiler/xla/service/copy_insertion_test.cc +++ b/tensorflow/compiler/xla/service/copy_insertion_test.cc @@ -235,8 +235,8 @@ TEST_F(CopyInsertionTest, BitcastParameter) { auto builder = HloComputation::Builder(TestName()); HloInstruction* x = builder.AddInstruction( HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {4}), "x")); - HloInstruction* bitcast = builder.AddInstruction(HloInstruction::CreateUnary( - ShapeUtil::MakeShape(F32, {2, 2}), HloOpcode::kBitcast, x)); + HloInstruction* bitcast = builder.AddInstruction( + HloInstruction::CreateBitcast(ShapeUtil::MakeShape(F32, {2, 2}), x)); auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); @@ -258,8 +258,9 @@ TEST_F(CopyInsertionTest, BitcastConstant) { HloInstruction* constant = builder.AddInstruction(HloInstruction::CreateConstant( LiteralUtil::CreateR1({1.0, 42.0}))); - HloInstruction* bitcast = builder.AddInstruction(HloInstruction::CreateUnary( - ShapeUtil::MakeShape(F32, {2, 2}), HloOpcode::kBitcast, constant)); + HloInstruction* bitcast = + builder.AddInstruction(HloInstruction::CreateBitcast( + ShapeUtil::MakeShape(F32, {2, 2}), constant)); auto module = CreateNewVerifiedModule(); module->AddEntryComputation(builder.Build()); @@ -279,8 +280,8 @@ TEST_F(CopyInsertionTest, BitcastTupleElementParameter) { auto builder = HloComputation::Builder(TestName()); HloInstruction* x = builder.AddInstruction( HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {4}), "x")); - HloInstruction* bitcast = builder.AddInstruction(HloInstruction::CreateUnary( - ShapeUtil::MakeShape(F32, {2, 2}), HloOpcode::kBitcast, x)); + HloInstruction* bitcast = builder.AddInstruction( + HloInstruction::CreateBitcast(ShapeUtil::MakeShape(F32, {2, 2}), x)); builder.AddInstruction(HloInstruction::CreateTuple({bitcast})); auto module = CreateNewVerifiedModule(); diff --git a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc index 0c020daec30..1ef007cc817 100644 --- a/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc +++ b/tensorflow/compiler/xla/service/hlo_alias_analysis_test.cc @@ -1008,8 +1008,8 @@ TEST_F(HloAliasAnalysisTest, Bitcast) { auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); - auto bitcast = builder.AddInstruction(HloInstruction::CreateUnary( - scalar_shape_, HloOpcode::kBitcast, constant)); + auto bitcast = builder.AddInstruction( + HloInstruction::CreateBitcast(scalar_shape_, constant)); 
module_->AddEntryComputation(builder.Build()); SCOPED_TRACE(module_->ToString()); @@ -1076,8 +1076,8 @@ TEST_F(HloAliasAnalysisTest, BitcastInterference) { auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); - auto bitcast = builder.AddInstruction(HloInstruction::CreateUnary( - scalar_shape_, HloOpcode::kBitcast, constant)); + auto bitcast = builder.AddInstruction( + HloInstruction::CreateBitcast(scalar_shape_, constant)); builder.AddInstruction(HloInstruction::CreateTuple({constant, bitcast})); module_->AddEntryComputation(builder.Build()); diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc index 407dfe796d8..ed4bac22a9f 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc @@ -1105,8 +1105,8 @@ TEST_P(HloDataflowAnalysisTest, BitcastDefinesValue) { auto builder = HloComputation::Builder(TestName()); auto constant = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); - auto bitcast = builder.AddInstruction(HloInstruction::CreateUnary( - scalar_shape_, HloOpcode::kBitcast, constant)); + auto bitcast = builder.AddInstruction( + HloInstruction::CreateBitcast(scalar_shape_, constant)); module_->AddEntryComputation(builder.Build()); SCOPED_TRACE(module_->ToString()); diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc index 046ffde7616..7d5a3b6623f 100644 --- a/tensorflow/compiler/xla/service/layout_assignment_test.cc +++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc @@ -819,8 +819,8 @@ TEST_F(LayoutAssignmentTest, InternalErrorOnBitcast) { auto constant0 = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR2WithLayout( {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({0, 1})))); - builder.AddInstruction(HloInstruction::CreateUnary( - constant0->shape(), HloOpcode::kBitcast, constant0)); + builder.AddInstruction( + HloInstruction::CreateBitcast(constant0->shape(), constant0)); auto m = CreateNewVerifiedModule(); m->AddEntryComputation(builder.Build()); diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc index d0515fb5825..be7ad99aac4 100644 --- a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc +++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc @@ -564,8 +564,8 @@ TEST_F(TuplePointsToAnalysisTest, TupleWithBitcast) { HloInstruction::CreateConstant(LiteralUtil::CreateR0(1.0))); auto constant2 = builder.AddInstruction( HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.0))); - auto bitcast = builder.AddInstruction(HloInstruction::CreateUnary( - constant2->shape(), HloOpcode::kBitcast, constant2)); + auto bitcast = builder.AddInstruction( + HloInstruction::CreateBitcast(constant2->shape(), constant2)); auto tuple = builder.AddInstruction(HloInstruction::CreateTuple({constant1, bitcast})); From 6e92ec7e92ac20b0a89ccd83b0de0675b1d28e4b Mon Sep 17 00:00:00 2001 From: Haoliang Zhang Date: Fri, 19 Jul 2019 16:55:08 -0700 Subject: [PATCH 0207/3053] [FIX] In Reshape op's verify, only get tensor element count when it has a static shape. 
PiperOrigin-RevId: 259063883 --- tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc | 2 +- tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index e39a6768ea4..f01306fe259 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -504,7 +504,7 @@ static LogicalResult Verify(ReshapeOp op) { auto rankByShape = shapeType.getShape()[0]; auto typeOfTensor = op.tensor()->getType().cast(); // No compile time verification for unknown sized shape. - if (rankByShape == -1 || !typeOfTensor.hasRank()) return success(); + if (rankByShape == -1 || !typeOfTensor.hasStaticShape()) return success(); // Check values if constant shape. No compiling time verification for // non-constant shape. auto *shapeOp = op.shape()->getDefiningOp(); diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir index 3b21c528c90..53b773f959d 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir @@ -190,6 +190,14 @@ func @testReshape(%arg0: tensor<10x10x10xf32>) -> tensor<100x100xf32> { return %r1 : tensor<100x100xf32> } +// ----- +// tf.Reshape with a first operand that has non-static shape. +func @testReshape(%arg0: tensor<10x10x?xf32>) -> tensor<10x10xf32> { + %shape1 = constant dense<[10, 10]> : tensor<2xi32> + %r1 = "tf.Reshape" (%arg0, %shape1) : (tensor<10x10x?xf32>, tensor<2xi32>) -> (tensor<10x10xf32>) + return %r1 : tensor<10x10xf32> +} + // ----- // CHECK-LABEL: func @testValidAvgPool From 4b0a8059921d484e52de8b90a16ee356565ee195 Mon Sep 17 00:00:00 2001 From: Nupur Garg Date: Fri, 19 Jul 2019 17:02:46 -0700 Subject: [PATCH 0208/3053] Make TFLiteConverter build with MLIR internally by default. PiperOrigin-RevId: 259064943 --- tensorflow/lite/python/lite_mlir_test.py | 80 +++++------------------- tensorflow/lite/toco/python/BUILD | 6 +- tensorflow/tensorflow.bzl | 3 + 3 files changed, 20 insertions(+), 69 deletions(-) diff --git a/tensorflow/lite/python/lite_mlir_test.py b/tensorflow/lite/python/lite_mlir_test.py index 98c0a5fe36e..f234eaf2301 100644 --- a/tensorflow/lite/python/lite_mlir_test.py +++ b/tensorflow/lite/python/lite_mlir_test.py @@ -40,28 +40,6 @@ from tensorflow.python.platform import test from tensorflow.python.training.tracking import tracking -def mlir_convert_and_check_for_unsupported(test_object, converter): - """Run the converter but don't fail MLIR was not built. - - Args: - test_object: PyTest object. - converter: A TFLiteConverter - - Returns: - The converted TF lite model or None if mlir support is not builtinto the - binary. 
- """ - try: - model = converter.convert() - test_object.assertTrue(model) - return model - except lite.ConverterError as e: - if not e.message.startswith('This flag is not supported by this version'): - raise e - else: - return None - - @test_util.run_v1_only('Incompatible with 2.0.') class FromSessionTest(test_util.TensorFlowTestCase): @@ -75,9 +53,7 @@ class FromSessionTest(test_util.TensorFlowTestCase): converter = lite.TFLiteConverter.from_session(sess, [in_tensor], [out_tensor]) converter.experimental_enable_mlir_converter = True - tflite_model = mlir_convert_and_check_for_unsupported(self, converter) - if tflite_model is None: - return + tflite_model = converter.convert() # Check values from converted model. interpreter = Interpreter(model_content=tflite_model) @@ -105,9 +81,7 @@ class FromSessionTest(test_util.TensorFlowTestCase): # Convert model and ensure model is not None. converter = lite.TFLiteConverter.from_session(sess, [in_tensor], [out_tensor]) - tflite_model = mlir_convert_and_check_for_unsupported(self, converter) - if tflite_model is None: - return + tflite_model = converter.convert() # Check values from converted model. interpreter = Interpreter(model_content=tflite_model) @@ -144,9 +118,7 @@ class FromSessionTest(test_util.TensorFlowTestCase): 'inputA': (0., 1.), 'inputB': (0., 1.) } # mean, std_dev - tflite_model = mlir_convert_and_check_for_unsupported(self, converter) - if tflite_model is None: - return + tflite_model = converter.convert() # Check values from converted model. interpreter = Interpreter(model_content=tflite_model) @@ -182,9 +154,7 @@ class FromSessionTest(test_util.TensorFlowTestCase): # Test conversion with the scalar input shape. converter = lite.TFLiteConverter.from_session(sess, [in_tensor], [out_tensor]) - tflite_model = mlir_convert_and_check_for_unsupported(self, converter) - if tflite_model is None: - return + tflite_model = converter.convert() # Check values from converted model. interpreter = Interpreter(model_content=tflite_model) @@ -228,18 +198,13 @@ class FromSessionTest(test_util.TensorFlowTestCase): # Convert float model. float_converter = lite.TFLiteConverter.from_session(sess, [in_tensor_1], [out_tensor]) - float_tflite = mlir_convert_and_check_for_unsupported(self, float_converter) - if float_tflite is None: - return + float_tflite = float_converter.convert() # Convert quantized weights model. quantized_converter = lite.TFLiteConverter.from_session( sess, [in_tensor_1], [out_tensor]) quantized_converter.optimizations = [lite.Optimize.DEFAULT] - quantized_tflite = mlir_convert_and_check_for_unsupported( - self, quantized_converter) - if quantized_tflite is None: - return + quantized_tflite = quantized_converter.convert() # Ensure that the quantized weights tflite model is smaller. self.assertLess(len(quantized_tflite), len(float_tflite)) @@ -266,9 +231,7 @@ class FromSessionTest(test_util.TensorFlowTestCase): # Convert model and ensure model is not None. converter = lite.TFLiteConverter.from_session(sess, [placeholder], [output_node]) - tflite_model = mlir_convert_and_check_for_unsupported(self, converter) - if tflite_model is None: - return + tflite_model = converter.convert() # Check values from converted model. interpreter = Interpreter(model_content=tflite_model) @@ -322,9 +285,7 @@ class FromConcreteFunctionTest(test_util.TensorFlowTestCase): # Convert model. 
converter = lite.TFLiteConverterV2.from_concrete_functions([concrete_func]) converter.experimental_enable_mlir_converter = True - tflite_model = mlir_convert_and_check_for_unsupported(self, converter) - if tflite_model is None: - return + tflite_model = converter.convert() # Check values from converted model. expected_value = root.f(input_data) @@ -359,9 +320,7 @@ class FromConcreteFunctionTest(test_util.TensorFlowTestCase): # Convert model. converter = lite.TFLiteConverterV2.from_concrete_functions([concrete_func]) converter.experimental_enable_mlir_converter = True - tflite_model = mlir_convert_and_check_for_unsupported(self, converter) - if tflite_model is None: - return + tflite_model = converter.convert() # Check values from converted model. expected_value = concrete_func(**input_data) @@ -389,9 +348,7 @@ class FromConcreteFunctionTest(test_util.TensorFlowTestCase): # Convert model. converter = lite.TFLiteConverterV2.from_concrete_functions([concrete_func]) converter.experimental_enable_mlir_converter = True - tflite_model = mlir_convert_and_check_for_unsupported(self, converter) - if tflite_model is None: - return + tflite_model = converter.convert() # Check values from converted model. expected_value = concrete_func(input_data)[0] @@ -422,9 +379,7 @@ class FromConcreteFunctionTest(test_util.TensorFlowTestCase): # Convert model. converter = lite.TFLiteConverterV2.from_concrete_functions([concrete_func]) converter.experimental_enable_mlir_converter = True - tflite_model = mlir_convert_and_check_for_unsupported(self, converter) - if tflite_model is None: - return + tflite_model = converter.convert() # Check values from converted model. expected_value = concrete_func(input_data) @@ -449,9 +404,7 @@ class FromConcreteFunctionTest(test_util.TensorFlowTestCase): # Convert model. converter = lite.TFLiteConverterV2.from_concrete_functions([concrete_func]) converter.experimental_enable_mlir_converter = True - tflite_model = mlir_convert_and_check_for_unsupported(self, converter) - if tflite_model is None: - return + tflite_model = converter.convert() # Check values from converted model. expected_value = concrete_func(input_data) @@ -478,9 +431,7 @@ class TestFlexMode(test_util.TensorFlowTestCase): [out_tensor]) converter.experimental_enable_mlir_converter = True converter.target_spec.supported_ops = set([lite.OpsSet.SELECT_TF_OPS]) - tflite_model = mlir_convert_and_check_for_unsupported(self, converter) - if tflite_model is None: - return + tflite_model = converter.convert() # Ensures the model contains TensorFlow ops. # TODO(nupurgarg): Check values once there is a Python delegate interface. @@ -505,10 +456,7 @@ class TestFlexMode(test_util.TensorFlowTestCase): converter = lite.TFLiteConverterV2.from_concrete_functions([concrete_func]) converter.experimental_enable_mlir_converter = True converter.target_spec.supported_ops = set([lite.OpsSet.SELECT_TF_OPS]) - - tflite_model = mlir_convert_and_check_for_unsupported(self, converter) - if tflite_model is None: - return + tflite_model = converter.convert() # Ensures the model contains TensorFlow ops. # TODO(nupurgarg): Check values once there is a Python delegate interface. 
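For reference, a minimal sketch of the conversion path these tests now exercise unconditionally; it relies only on the converter API already used above, and the tiny tf.function model is an illustrative assumption:

    import tensorflow as tf
    from tensorflow.lite.python import lite

    # Trace a trivial model to a concrete function.
    root = tf.Module()
    root.f = tf.function(lambda x: 2. * x + 1.)
    concrete_func = root.f.get_concrete_function(
        tf.TensorSpec(shape=[1, 4], dtype=tf.float32))

    # Opt into the MLIR-based converter; support for it is now compiled in by
    # default, so convert() no longer needs an "unsupported flag" fallback.
    converter = lite.TFLiteConverterV2.from_concrete_functions([concrete_func])
    converter.experimental_enable_mlir_converter = True
    tflite_model = converter.convert()  # serialized .tflite flatbuffer bytes
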
diff --git a/tensorflow/lite/toco/python/BUILD b/tensorflow/lite/toco/python/BUILD index 1f4e86f85c8..79357f66676 100644 --- a/tensorflow/lite/toco/python/BUILD +++ b/tensorflow/lite/toco/python/BUILD @@ -1,5 +1,5 @@ load("//tensorflow:tensorflow.bzl", "tf_py_wrap_cc") -load("//tensorflow:tensorflow.bzl", "if_mlir", "py_binary", "tf_py_test") +load("//tensorflow:tensorflow.bzl", "if_mlir_tflite", "py_binary", "tf_py_test") package( default_visibility = [ @@ -22,7 +22,7 @@ cc_library( name = "toco_python_api", srcs = ["toco_python_api.cc"], hdrs = ["toco_python_api.h"], - defines = if_mlir( + defines = if_mlir_tflite( if_false = [], if_true = ["TFLITE_BUILD_WITH_MLIR_CONVERTER"], ), @@ -46,7 +46,7 @@ cc_library( "//tensorflow/core:ops", ], "//conditions:default": [], - }) + if_mlir( + }) + if_mlir_tflite( if_false = [], if_true = ["//tensorflow/compiler/mlir/lite/python:graphdef_to_tfl_flatbuffer"], ), diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 5d9aba8637a..d253d5b8799 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -2493,5 +2493,8 @@ def if_mlir(if_true, if_false = []): "//tensorflow:with_mlir_support": if_true, }) +def if_mlir_tflite(if_true, if_false = []): + return if_mlir(if_true, if_false) + def tfcompile_extra_flags(): return "" From e54190d1ce02f400b13b002c00258fc82b9f0c1c Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Fri, 19 Jul 2019 17:11:27 -0700 Subject: [PATCH 0209/3053] [XLA GPU] Minor kernel_tiling.h cleanup: use std::array<3> in place of a vector To indicate that a member variable only has three elements stored. PiperOrigin-RevId: 259065985 --- .../xla/service/llvm_ir/kernel_tiling.cc | 29 ++++++++++--------- .../xla/service/llvm_ir/kernel_tiling.h | 18 +++++------- tensorflow/compiler/xla/util.h | 13 --------- 3 files changed, 23 insertions(+), 37 deletions(-) diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc index 2ef844ffa62..2f131289377 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc @@ -54,6 +54,15 @@ Shape MergeDimensions(absl::Span segs, const Shape& shape) { dimensions); } +std::array ElementWiseCeilOfRatio(std::array dividends, + std::array divisors) { + std::array out; + for (int i = 0; i < 3; i++) { + out[i] = CeilOfRatio(dividends.at(i), divisors.at(i)); + } + return out; +} + } // namespace absl::optional > FindTranspose021(const Shape& a, @@ -99,26 +108,20 @@ KernelMappingScheme::KernelMappingScheme( absl::Span req_block_sizes, int64 num_threads_y, int64 num_threads_x, llvm::IRBuilder<>* b) : b_(b), - dims_in_elems_(dims_in_elems.begin(), dims_in_elems.end()), + dims_in_elems_{dims_in_elems.at(0), dims_in_elems.at(1), + dims_in_elems.at(2)}, tile_sizes_{1, tile_size_y, tile_size_x}, + dims_in_tiles_(ElementWiseCeilOfRatio(dims_in_elems_, tile_sizes_)), + block_sizes_{std::min(req_block_sizes.at(0), dims_in_tiles_.at(0)), + std::min(req_block_sizes.at(1), dims_in_tiles_.at(1)), + std::min(req_block_sizes.at(2), dims_in_tiles_.at(2))}, + dims_in_blocks_(ElementWiseCeilOfRatio(dims_in_tiles_, block_sizes_)), num_threads_x_(num_threads_x), num_threads_y_(num_threads_y), dilated_x_(true) { - DCHECK_EQ(dims_in_elems_.size(), 3); DCHECK_EQ(req_block_sizes.size(), 3); - DCHECK_EQ(tile_size_y % num_threads_y_, 0); DCHECK_EQ(tile_size_x % num_threads_x_, 0); - - dims_in_tiles_ = ElementWiseCeilOfRatio(dims_in_elems_, tile_sizes_); - 
block_sizes_.reserve(req_block_sizes.size()); - absl::c_transform(req_block_sizes, dims_in_tiles_, - std::back_inserter(block_sizes_), - [](const int64 requested_size, const int64 max_size) { - return std::min(requested_size, max_size); - }); - dims_in_blocks_ = ElementWiseCeilOfRatio(dims_in_tiles_, block_sizes_); - VLOG(10) << "dims_in_elems_ = [" << absl::StrJoin(dims_in_elems_, ",") << "]"; VLOG(10) << "dims_in_tiles_ = [" << absl::StrJoin(dims_in_tiles_, ",") << "]"; VLOG(10) << "dims_in_blocks_ = [" << absl::StrJoin(dims_in_blocks_, ",") diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h index f802cc27d51..80f42214d33 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h +++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h @@ -125,10 +125,7 @@ class KernelMappingScheme { return absl::c_accumulate(dims_in_blocks_, 1, std::multiplies()); } - int64 GetTileSizeForDimension(int d) const { - DCHECK(d >= DimZ && d <= DimX); - return tile_sizes_[d]; - } + int64 GetTileSizeForDimension(int d) const { return tile_sizes_.at(d); } int64 GetTileSizeForDimensionX() const { return GetTileSizeForDimension(DimX); } @@ -138,8 +135,7 @@ class KernelMappingScheme { absl::Span GetBlockSizes() const { return block_sizes_; } int64 GetTileBlockSizeForDimension(int d) const { - DCHECK(d >= DimZ && d <= DimX); - return dims_in_blocks_[d]; + return dims_in_blocks_.at(d); } int64 GetNumberOfThreadsForDimensionX() const { return num_threads_x_; } @@ -181,19 +177,19 @@ class KernelMappingScheme { private: llvm::IRBuilder<>* b_; // The number of elements in each dimension. - std::vector dims_in_elems_; + std::array dims_in_elems_; // The number of elements for each dimension of a tile. - std::vector tile_sizes_; + std::array tile_sizes_; // The number of tiles in each dimension. It is computed from dims_in_elem_ // and tile_sizes_. - std::vector dims_in_tiles_; + std::array dims_in_tiles_; // The number of tiles for each dimension of a tile block. - std::vector block_sizes_; + std::array block_sizes_; // The number of blocks in each dimension of a tile block. It is computed from // dims_in_tile_ and block_sizes_. - std::vector dims_in_blocks_; + std::array dims_in_blocks_; // Number of threads used to process elements in the X direction of a tile. int64 num_threads_x_; diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h index dacb5faa228..06ea42235b2 100644 --- a/tensorflow/compiler/xla/util.h +++ b/tensorflow/compiler/xla/util.h @@ -424,19 +424,6 @@ T CeilOfRatio(T dividend, T divisor) { return tensorflow::MathUtil::CeilOfRatio(dividend, divisor); } -template -std::vector ElementWiseCeilOfRatio(absl::Span dividends, - absl::Span divisors) { - std::vector ceil_of_ratios; - CHECK_EQ(dividends.size(), divisors.size()); - ceil_of_ratios.reserve(dividends.size()); - absl::c_transform(dividends, divisors, std::back_inserter(ceil_of_ratios), - [](const T dividend, const T divisor) { - return CeilOfRatio(dividend, divisor); - }); - return ceil_of_ratios; -} - // Rounds the value up to a multiple of the divisor by first calling CeilOfRatio // then multiplying by the divisor. For example: RoundUpToNearest(13, 8) => 16 template From 4969bcf6ec066cf9448c355ea766be4e47b19421 Mon Sep 17 00:00:00 2001 From: Yanan Cao Date: Fri, 19 Jul 2019 17:16:10 -0700 Subject: [PATCH 0210/3053] Lift outside compilation only arguments from functional If nodes. 
PiperOrigin-RevId: 259066480 --- .../jit/extract_outside_compilation_pass.cc | 110 +++++++++++++++++- 1 file changed, 107 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc index 8935cdfc240..4be94666fc4 100644 --- a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc +++ b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc @@ -523,7 +523,8 @@ xla::StatusOr> UpdateTypesAttribute( // Add edges from lifted outside compilation argument nodes to `n` in Graph `g`. void AddEdgesFromOutsideCompilationNodes( - const int original_arg_count, const std::vector& data_types, + const int original_arg_count, const int arg_to_input_edge_offset, + const std::vector& data_types, const std::vector>& lifted_arg_nodes_and_outside_compilation_nodes, Graph* g, Node* n) { @@ -532,7 +533,7 @@ void AddEdgesFromOutsideCompilationNodes( Node* outside_compilation_node = lifted_arg_nodes_and_outside_compilation_nodes[i - original_arg_count] .second; - g->AddEdge(outside_compilation_node, 0, n, i); + g->AddEdge(outside_compilation_node, 0, n, i + arg_to_input_edge_offset); } } @@ -630,7 +631,8 @@ Status PostprocessLiftedArgsForWhile( // Add edges from outside compilation nodes to While node. AddEdgesFromOutsideCompilationNodes( - original_arg_count, data_types, + original_arg_count, + /*arg_to_input_edge_offset=*/0, data_types, lifted_arg_nodes_and_outside_compilation_nodes, g, n); // In body_graph, create new _Arg/_Retval nodes, and replace lifted arg @@ -682,6 +684,103 @@ Status PostprocessLiftedArgsForWhile( return Status::OK(); } +Status PostprocessLiftedArgsForIf( + const std::unordered_map& outside_compilation_attr_to_node, + Graph* g, Node* n, FunctionLibraryDefinition* fld) { + TF_RET_CHECK(n->type_string() == "If"); + + NameAttrList then_branch_func; + TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "then_branch", &then_branch_func)); + const FunctionDef* then_branch_function_def = + fld->Find(then_branch_func.name()); + TF_RET_CHECK(then_branch_function_def); + + NameAttrList else_branch_func; + TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "else_branch", &else_branch_func)); + const FunctionDef* else_branch_function_def = + fld->Find(else_branch_func.name()); + TF_RET_CHECK(else_branch_function_def); + + // Nothing to do if neither branch contains any lifted arguments. + if (!HasLiftedArgs(*then_branch_function_def) && + !HasLiftedArgs(*else_branch_function_def)) { + return Status::OK(); + } + + std::unique_ptr then_branch_function_body; + TF_RETURN_IF_ERROR(FunctionDefToBodyHelper( + *then_branch_function_def, AttrSlice(&then_branch_func.attr()), fld, + &then_branch_function_body)); + + std::unique_ptr else_branch_function_body; + TF_RETURN_IF_ERROR(FunctionDefToBodyHelper( + *else_branch_function_def, AttrSlice(&else_branch_func.attr()), fld, + &else_branch_function_body)); + + // Then and else branches have same argument count and argument data types. 
+ int original_arg_count = then_branch_function_body->arg_nodes.size(); + + TF_ASSIGN_OR_RETURN( + auto then_branch_lifted_arg_nodes_and_outside_compilation_nodes, + LiftedArgsAndOutsideCompilationNodesInFunctionBody( + *then_branch_function_body, outside_compilation_attr_to_node)); + + TF_ASSIGN_OR_RETURN( + auto else_branch_lifted_arg_nodes_and_outside_compilation_nodes, + LiftedArgsAndOutsideCompilationNodesInFunctionBody( + *else_branch_function_body, outside_compilation_attr_to_node)); + + // Append lifted args' types to If node's Tin attribute. + TF_ASSIGN_OR_RETURN( + std::vector data_types, + UpdateTypesAttribute( + then_branch_lifted_arg_nodes_and_outside_compilation_nodes, "Tin", + n)); + + // Add edges from outside compilation nodes to If node. If node's input #0 + // is predicate input, input #1 maps to _Arg #0 of branch functions, thus + // arg_to_input_edge_offset is set to 1. + AddEdgesFromOutsideCompilationNodes( + original_arg_count, + /*arg_to_input_edge_offset=*/1, data_types, + then_branch_lifted_arg_nodes_and_outside_compilation_nodes, g, n); + + for (int i = original_arg_count; i < data_types.size(); ++i) { + TF_ASSIGN_OR_RETURN(Node * then_branch_arg_node, + AddOutsideCompilationInputArgToFunctionBody( + *then_branch_function_body, i, data_types[i])); + + ReplaceLiftedArgNodePlaceholderWithArg( + *then_branch_function_body, original_arg_count, i, + then_branch_lifted_arg_nodes_and_outside_compilation_nodes, + then_branch_arg_node); + + TF_ASSIGN_OR_RETURN(Node * else_branch_arg_node, + AddOutsideCompilationInputArgToFunctionBody( + *else_branch_function_body, i, data_types[i])); + + ReplaceLiftedArgNodePlaceholderWithArg( + *else_branch_function_body, original_arg_count, i, + else_branch_lifted_arg_nodes_and_outside_compilation_nodes, + else_branch_arg_node); + } + + FunctionDef rewritten_then_branch_function_def; + TF_RETURN_IF_ERROR(GraphToFunctionDef( + *then_branch_function_body->graph, then_branch_func.name(), + HostGraphControlRetMapping, &rewritten_then_branch_function_def)); + TF_RETURN_IF_ERROR(fld->ReplaceFunction(then_branch_func.name(), + rewritten_then_branch_function_def)); + + FunctionDef rewritten_else_branch_function_def; + TF_RETURN_IF_ERROR(GraphToFunctionDef( + *else_branch_function_body->graph, else_branch_func.name(), + HostGraphControlRetMapping, &rewritten_else_branch_function_def)); + TF_RETURN_IF_ERROR(fld->ReplaceFunction(else_branch_func.name(), + rewritten_else_branch_function_def)); + return Status::OK(); +} + // Creates a mapping from outside compilation cluster name to lifted argument // placeholder. xla::StatusOr> OutsideCompilationAttrToNode( @@ -716,6 +815,11 @@ Status PostprocessLiftedArgs(Graph* g, FunctionLibraryDefinition* fld) { TF_RETURN_IF_ERROR(PostprocessLiftedArgsForWhile( outside_compilation_attr_to_node, g, n, fld)); } + + if (n->type_string() == "If") { + TF_RETURN_IF_ERROR(PostprocessLiftedArgsForIf( + outside_compilation_attr_to_node, g, n, fld)); + } } return Status::OK(); From 182b425b1648c28bf29178a70ae8aef3b57def69 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 19 Jul 2019 17:19:54 -0700 Subject: [PATCH 0211/3053] Falls back from Keras v2 loop when the user supplies an unsupported data type. 
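A minimal sketch of the fallback path, assuming only the select_data_adapter helper and the warning message added in the diff below (variable names are illustrative):

    try:
      adapter_cls = data_adapter.select_data_adapter(inputs, None)
    except ValueError as data_failure_exception:
      adapter_cls = None
      logging.warning('Falling back from v2 loop because of error: '
                      '%s' % data_failure_exception)
    if adapter_cls:
      return training_v2.Loop()  # input type is supported by a data adapter
    # otherwise fall through to the pre-existing training loops
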
PiperOrigin-RevId: 259066843 --- .../python/keras/engine/data_adapter.py | 30 +++++++++++++++---- tensorflow/python/keras/engine/training.py | 10 ++++++- .../python/keras/utils/io_utils_test.py | 10 +++++-- 3 files changed, 42 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/keras/engine/data_adapter.py b/tensorflow/python/keras/engine/data_adapter.py index 28e52b4241e..87815772bd9 100644 --- a/tensorflow/python/keras/engine/data_adapter.py +++ b/tensorflow/python/keras/engine/data_adapter.py @@ -339,17 +339,37 @@ ALL_ADAPTER_CLS = [ def select_data_adapter(x, y): + """Selects a data adapter than can handle a given x and y.""" adapter_cls = [cls for cls in ALL_ADAPTER_CLS if cls.can_handle(x, y)] if not adapter_cls: - raise ValueError("Failed to find data adapter that can handle " - "input: {}, {}".format(type(x), type(y))) + # TODO(scottzhu): This should be a less implementation-specific error. + raise ValueError( + "Failed to find data adapter that can handle " + "input: {}, {}".format( + _type_name(x), _type_name(y))) elif len(adapter_cls) > 1: - raise RuntimeError("Data adapter should be mutually exclusive for " - "handling inputs. Found multiple adapter {} to handle " - "input: {}, {}".format(adapter_cls, type(x), type(y))) + raise RuntimeError( + "Data adapters should be mutually exclusive for " + "handling inputs. Found multiple adapters {} to handle " + "input: {}, {}".format( + adapter_cls, _type_name(x), _type_name(y))) return adapter_cls[0] +def _type_name(x): + """Generates a description of the type of an object.""" + if isinstance(x, dict): + key_types = set(_type_name(key) for key in x.keys()) + val_types = set(_type_name(key) for key in x.values()) + return "({} containing {} keys and {} values)".format( + type(x), key_types, val_types) + if isinstance(x, (list, tuple)): + types = set(_type_name(val) for val in x) + return "({} containing values of types {})".format( + type(x), types) + return str(type(x)) + + def _process_numpy_inputs(inputs): """Process numpy array inputs. diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index c4e3378c182..1fefa5744cd 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -43,6 +43,7 @@ from tensorflow.python.keras import losses from tensorflow.python.keras import metrics as metrics_module from tensorflow.python.keras import optimizers from tensorflow.python.keras.distribute import distributed_training_utils +from tensorflow.python.keras.engine import data_adapter from tensorflow.python.keras.engine import network from tensorflow.python.keras.engine import training_arrays from tensorflow.python.keras.engine import training_distributed @@ -473,7 +474,14 @@ class Model(network.Network): and not isinstance(inputs, (data_utils.Sequence)) and not distributed_training_utils.is_tpu_strategy( self._distribution_strategy)): - return training_v2.Loop() + try: + valid_adapter = data_adapter.select_data_adapter(inputs, None) + except ValueError as data_failure_exception: + valid_adapter = None + logging.warning('Falling back from v2 loop because of error: ' + '%s' % data_failure_exception) + if valid_adapter: + return training_v2.Loop() # Case 1: distribution strategy. 
if self._distribution_strategy: diff --git a/tensorflow/python/keras/utils/io_utils_test.py b/tensorflow/python/keras/utils/io_utils_test.py index b2801de56fa..30e59f9db65 100644 --- a/tensorflow/python/keras/utils/io_utils_test.py +++ b/tensorflow/python/keras/utils/io_utils_test.py @@ -25,6 +25,8 @@ import numpy as np import six from tensorflow.python import keras +from tensorflow.python.keras import keras_parameterized +from tensorflow.python.keras import testing_utils from tensorflow.python.keras.utils import io_utils from tensorflow.python.platform import test @@ -47,8 +49,10 @@ def create_dataset(h5_path='test.h5'): f.close() -class TestIOUtils(test.TestCase): +class TestIOUtils(keras_parameterized.TestCase): + # TODO(b/137965102): eventually support this in eager + the v2 loops + @keras_parameterized.run_all_keras_modes(always_skip_eager=True) def test_HDF5Matrix(self): if h5py is None: return @@ -80,7 +84,9 @@ class TestIOUtils(test.TestCase): model = keras.models.Sequential() model.add(keras.layers.Dense(64, input_shape=(10,), activation='relu')) model.add(keras.layers.Dense(1, activation='sigmoid')) - model.compile(loss='binary_crossentropy', optimizer='sgd') + model.compile(loss='binary_crossentropy', optimizer='sgd', + run_eagerly=testing_utils.should_run_eagerly(), + run_distributed=testing_utils.should_run_distributed()) # Note: you have to use shuffle='batch' or False with HDF5Matrix model.fit(x_train, y_train, batch_size=32, shuffle='batch', verbose=False) From 8faa8bc20eae5f2b53a35dea7a56501d4e371870 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 19 Jul 2019 17:27:24 -0700 Subject: [PATCH 0212/3053] [tracing] unify internal and external ScopedAnnotation implementation. PiperOrigin-RevId: 259067560 --- tensorflow/core/BUILD | 15 ++ tensorflow/core/common_runtime/executor.cc | 9 +- tensorflow/core/platform/annotation.h | 145 ++++++++++++++++++ .../core/platform/default/device_tracer.cc | 72 ++------- tensorflow/core/platform/tracing.cc | 21 +-- tensorflow/core/platform/tracing.h | 62 +------- tensorflow/core/profiler/internal/BUILD | 2 - .../internal/scoped_annotation_test.cc | 83 ++++++++-- 8 files changed, 245 insertions(+), 164 deletions(-) create mode 100644 tensorflow/core/platform/annotation.h diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index ca158b3486b..3b16fd92faa 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -179,6 +179,7 @@ package_group( name = "dependency_whitelist", packages = [ "//learning/freud/topic_models/tensorflow/...", + "//perftools/accelerators/xprof/api/...", "//quality/webanswers/brain/tokenization/custom_tf_ops/kernels/...", ], ) @@ -2451,6 +2452,7 @@ LIB_INTERNAL_PUBLIC_HEADERS = tf_additional_lib_hdrs() + [ "lib/strings/proto_serialization.h", "lib/strings/scanner.h", "lib/wav/wav_io.h", + "platform/annotation.h", "platform/demangle.h", "platform/denormal.h", "platform/host_info.h", @@ -2464,6 +2466,19 @@ LIB_INTERNAL_PUBLIC_HEADERS = tf_additional_lib_hdrs() + [ "util/env_var.h", ] +cc_library( + name = "annotation", + srcs = [], + hdrs = [ + "platform/annotation.h", + ], + copts = tf_copts(), + visibility = ["//visibility:public"], + deps = [ + "@com_google_absl//absl/strings", + ], +) + # Replicated for lib_internal and lib_internal_impl. 
LIB_INTERNAL_DEFINES = ( tf_additional_lib_defines() + [ diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index 065a6782811..bc0609e04e2 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -1612,12 +1612,9 @@ bool MightTrace(const NodeItem& item, if (event_collector != nullptr) { return true; } - auto* trace_collector = tracing::GetTraceCollector(); - if (trace_collector) { - if (using_annotations && trace_collector->IsEnabledForAnnotations()) { - return true; - } - } + + if (using_annotations && tracing::ScopedAnnotation::IsEnabled()) return true; + return profiler::TraceMeRecorder::Active( profiler::GetTFTraceMeLevel(item.kernel->IsExpensive())); } diff --git a/tensorflow/core/platform/annotation.h b/tensorflow/core/platform/annotation.h new file mode 100644 index 00000000000..660767eec25 --- /dev/null +++ b/tensorflow/core/platform/annotation.h @@ -0,0 +1,145 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_PLATFORM_ANNOTATION_H_ +#define TENSORFLOW_CORE_PLATFORM_ANNOTATION_H_ + +#include + +#include +#include + +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { + +// Backend for ScopedAnnotation. +class Annotation { + public: + // Appends name to the annotation for the current thread and returns the + // original length of the annotation. + // Append name to the current annotation, separated by "::". + // The choice of separator "::" is based on characters not used by + // TensorFlow for its TensorOps. + static size_t PushAnnotation(absl::string_view name) { + std::string* annotation = ThreadAnnotation(); + size_t old_length = annotation->size(); + if (old_length != 0) { + absl::StrAppend(annotation, "::", name); + } else { + *annotation = std::string(name); + } + return old_length; + } + + static size_t PushAnnotation(std::string&& name) { + std::string* annotation = ThreadAnnotation(); + size_t old_length = annotation->size(); + if (old_length != 0) { + absl::StrAppend(annotation, "::", name); + } else { + *annotation = std::move(name); + } + return old_length; + } + + // Returns the annotation for the current thread. + static const std::string& CurrentAnnotation() { return *ThreadAnnotation(); } + + // Resizes the annotation for the current thread to its old length. + static void PopAnnotation(size_t old_length) { + ThreadAnnotation()->resize(old_length); + } + + private: + Annotation(const Annotation&) = delete; // Unconstructible. + + // Returns a reference to the annotation for the current thread. 
+ static std::string* ThreadAnnotation() { + static thread_local std::string annotation; + return &annotation; + } +}; + +namespace tracing { +// Adds an annotation to all activities for the duration of the instance +// lifetime through the currently registered TraceCollector. +// +// Usage: { +// ScopedAnnotation annotation("my kernels"); +// Kernel1<<>>; +// LaunchKernel2(); // Launches a CUDA kernel. +// } +// This will add 'my kernels' to both kernels in the profiler UI +class ScopedAnnotation { + public: + explicit ScopedAnnotation(absl::string_view name) { + if (TF_PREDICT_FALSE(IsEnabled())) { + old_length_ = Annotation::PushAnnotation(name); + } + } + + explicit ScopedAnnotation(const char* name) + : ScopedAnnotation(absl::string_view(name)) {} + + explicit ScopedAnnotation(const std::string& name) { + if (TF_PREDICT_FALSE(IsEnabled())) { + old_length_ = Annotation::PushAnnotation(name); + } + } + + explicit ScopedAnnotation(std::string&& name) { + if (TF_PREDICT_FALSE(IsEnabled())) { + old_length_ = Annotation::PushAnnotation(std::move(name)); + } + } + + template + explicit ScopedAnnotation(NameGeneratorT name_generator) { + if (TF_PREDICT_FALSE(IsEnabled())) { + old_length_ = Annotation::PushAnnotation(name_generator()); + } + } + + // Deprecated: use the lambda version if you want to concatenate strings as + // annotation on the fly. + ScopedAnnotation(absl::string_view name_part1, absl::string_view name_part2) + : ScopedAnnotation( + [&]() { return StrCat(name_part1, ":", name_part2); }) {} + + // Pops the name passed in the constructor from the current annotation. + ~ScopedAnnotation() { + // TODO(b/137971921): without this memory fence, two presubmit tests will + // fail probably due to compiler in that presubmit config. + std::atomic_thread_fence(std::memory_order_acquire); + if (TF_PREDICT_FALSE(old_length_ != kInvalidLength)) { + Annotation::PopAnnotation(old_length_); + } + } + + static void Enable(bool enable); + static const bool IsEnabled(); + + private: + // signals that annotation is disabled at the constructor. + static constexpr size_t kInvalidLength = static_cast(-1); + size_t old_length_ = kInvalidLength; +}; + +} // namespace tracing +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_ANNOTATION_H_ diff --git a/tensorflow/core/platform/default/device_tracer.cc b/tensorflow/core/platform/default/device_tracer.cc index 04e6282edbe..27565a7f052 100644 --- a/tensorflow/core/platform/default/device_tracer.cc +++ b/tensorflow/core/platform/default/device_tracer.cc @@ -34,6 +34,7 @@ limitations under the License. #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/abi.h" +#include "tensorflow/core/platform/annotation.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/mem.h" @@ -105,12 +106,6 @@ Status CreateAndRecordEvent(CUevent* event, CUstream stream) { return ToStatus(cuEventRecord(*event, stream)); } -// Thread-local state recording the most recent annotation (if any). -// When non-null, this points to a string in the active annotation -// of the current thread. The annotation is guaranteed to remain live -// for the duration of the CUPTI API callback. -static thread_local const char* tls_current_annotation; - // Stores a series of kernel and memcpy records. 
class CudaEventRecorder { public: @@ -121,8 +116,9 @@ class CudaEventRecorder { KernelRecord record = {kernel_name, context, stream}; LogIfError(CreateAndRecordEvent(&record.start_event, stream)); mutex_lock lock(mutex_); - if (tls_current_annotation) { - record.annotation = &*annotations_.emplace(tls_current_annotation).first; + if (tracing::ScopedAnnotation::IsEnabled()) { + record.annotation = + &*annotations_.emplace(Annotation::CurrentAnnotation()).first; } kernel_records_.push_back(record); return kernel_records_.size() - 1; @@ -140,8 +136,9 @@ class CudaEventRecorder { MemcpyRecord record = {src_type, dst_type, size_bytes, context, stream}; LogIfError(CreateAndRecordEvent(&record.start_event, stream)); mutex_lock lock(mutex_); - if (tls_current_annotation) { - record.annotation = &*annotations_.emplace(tls_current_annotation).first; + if (tracing::ScopedAnnotation::IsEnabled()) { + record.annotation = + &*annotations_.emplace(Annotation::CurrentAnnotation()).first; } memcpy_records_.push_back(record); return memcpy_records_.size() - 1; @@ -319,56 +316,6 @@ class CuptiCallbackHook { CUpti_SubscriberHandle subscriber_; }; -class TraceCollectorImpl : public tracing::TraceCollector { - public: - TraceCollectorImpl() : active_trace_session_(false) { - tracing::SetTraceCollector(this); - } - - ~TraceCollectorImpl() override { - DCHECK(!active_trace_session_) - << "Unexpected active trace session detected."; - } - - // Note the method can be called after a call to Stop(). - virtual std::unique_ptr CreateAnnotationHandle( - StringPiece name_part1, StringPiece name_part2) const { - struct Impl : public tracing::TraceCollector::Handle { - std::string annotation; - explicit Impl(std::string&& name_scope) : annotation(name_scope) { - VLOG(2) << "CreateAnnotationHandle " << annotation; - // Remember the most recent ScopedAnnotation for each thread. - tls_current_annotation = annotation.c_str(); - } - ~Impl() override { tls_current_annotation = nullptr; } - }; - return absl::make_unique(ConcatenateNames(name_part1, name_part2)); - } - - bool IsEnabledForAnnotations() const override { - return active_trace_session_.load(std::memory_order_relaxed); - } - - void Start() { - DCHECK(!active_trace_session_) - << "Unexpected active trace session detected."; - active_trace_session_ = true; - } - - void Stop() { - DCHECK(active_trace_session_) << "No active trace session detected. "; - active_trace_session_ = false; - } - - private: - std::atomic active_trace_session_; -}; - -TraceCollectorImpl* GlobalDefaultTraceCollector() { - static auto* instance = new TraceCollectorImpl(); - return instance; -} - // 'DeviceTracer' is an interface for collecting low-level execution timings // of hardware accelerator (e.g. GPU) computation and DMA transfers. class DeviceTracer : public profiler::ProfilerInterface { @@ -412,8 +359,7 @@ Status DeviceTracer::Start() { cupti_hook_.reset(new CuptiCallbackHook()); TF_RETURN_IF_ERROR(cupti_hook_->Enable(recorder_.get())); - // Register as a TraceEngine to receive ScopedAnnotations. 
- GlobalDefaultTraceCollector()->Start(); + tracing::ScopedAnnotation::Enable(true); enabled_ = true; return Status::OK(); @@ -426,7 +372,7 @@ Status DeviceTracer::Stop() { return Status::OK(); } cupti_hook_.reset(); - GlobalDefaultTraceCollector()->Stop(); + tracing::ScopedAnnotation::Enable(false); enabled_ = false; return Status::OK(); diff --git a/tensorflow/core/platform/tracing.cc b/tensorflow/core/platform/tracing.cc index c0386c0a3fc..ab8c3ec4ea5 100644 --- a/tensorflow/core/platform/tracing.cc +++ b/tensorflow/core/platform/tracing.cc @@ -29,7 +29,7 @@ namespace tensorflow { namespace tracing { namespace { std::atomic unique_arg{1}; -std::atomic trace_collector; +std::atomic enable_annotation; } // namespace const char* GetEventCategoryName(EventCategory category) { @@ -61,23 +61,12 @@ uint64 GetArgForName(StringPiece name) { return Hash64(name.data(), name.size()); } -string TraceCollector::ConcatenateNames(StringPiece first, StringPiece second) { - std::string result; - bool has_two_parts = !first.empty() && !second.empty(); - result.reserve(first.size() + second.size() + - static_cast(has_two_parts)); - result.append(first.data(), first.size()); - if (has_two_parts) result.append({':'}); - result.append(second.data(), second.size()); - return result; +void ScopedAnnotation::Enable(bool enable) { + return enable_annotation.store(enable, std::memory_order_release); } -void SetTraceCollector(const TraceCollector* collector) { - return trace_collector.store(collector, std::memory_order_release); -} - -const TraceCollector* GetTraceCollector() { - return trace_collector.load(std::memory_order_acquire); +const bool ScopedAnnotation::IsEnabled() { + return enable_annotation.load(std::memory_order_acquire); } } // namespace tracing diff --git a/tensorflow/core/platform/tracing.h b/tensorflow/core/platform/tracing.h index 9b2886f1c42..45d28f84f40 100644 --- a/tensorflow/core/platform/tracing.h +++ b/tensorflow/core/platform/tracing.h @@ -26,6 +26,7 @@ limitations under the License. #include "absl/memory/memory.h" #include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/annotation.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/platform.h" @@ -141,67 +142,6 @@ class ScopedRegion { const EventCollector* collector_; }; -// Interface for accelerator profiler annotations. -class TraceCollector { - public: - class Handle { - public: - virtual ~Handle() {} - }; - - virtual ~TraceCollector() {} - virtual std::unique_ptr CreateAnnotationHandle( - StringPiece name_part1, StringPiece name_part2) const = 0; - - // Returns true if this annotation tracing is enabled for any op. - virtual bool IsEnabledForAnnotations() const = 0; - - static string ConcatenateNames(StringPiece first, StringPiece second); - - private: - friend void SetTraceCollector(const TraceCollector*); - friend const TraceCollector* GetTraceCollector(); -}; -// Set the callback for ScopedAnnotation and ScopedActivity. -void SetTraceCollector(const TraceCollector* collector); -// Returns the callback for ScopedAnnotation and ScopedActivity. -const TraceCollector* GetTraceCollector(); - -// Adds an annotation to all activities for the duration of the instance -// lifetime through the currently registered TraceCollector. -// -// Usage: { -// ScopedAnnotation annotation("my kernels"); -// Kernel1<<>>; -// LaunchKernel2(); // Launches a CUDA kernel. 
-// } -// This will add 'my kernels' to both kernels in the profiler UI -class ScopedAnnotation { - public: - explicit ScopedAnnotation(StringPiece name) - : ScopedAnnotation(name, StringPiece()) {} - - // If tracing is enabled, add a name scope of - // ":". This can be cheaper than the - // single-argument constructor because the concatenation of the - // label string is only done if tracing is enabled. - ScopedAnnotation(StringPiece name_part1, StringPiece name_part2) - : handle_([&] { - auto trace_collector = GetTraceCollector(); - return trace_collector ? trace_collector->CreateAnnotationHandle( - name_part1, name_part2) - : nullptr; - }()) {} - - static bool IsEnabled() { - auto* trace_collector = GetTraceCollector(); - return trace_collector && trace_collector->IsEnabledForAnnotations(); - } - - private: - std::unique_ptr handle_; -}; - // Return the pathname of the directory where we are writing log files. const char* GetLogDir(); diff --git a/tensorflow/core/profiler/internal/BUILD b/tensorflow/core/profiler/internal/BUILD index 15a9497890f..71a35425da0 100644 --- a/tensorflow/core/profiler/internal/BUILD +++ b/tensorflow/core/profiler/internal/BUILD @@ -370,7 +370,6 @@ tf_cuda_library( srcs = ["traceme_recorder.cc"], hdrs = ["traceme_recorder.h"], visibility = [ - "//learning/brain/runtime:__pkg__", # xprof_bridge "//perftools/accelerators/xprof/xprofilez:__pkg__", # alias xprof::TraceMeRecorder "//tensorflow/core:__pkg__", # executor.cc "//tensorflow/core/profiler/internal/cpu:__pkg__", # host_tracer @@ -435,7 +434,6 @@ tf_cc_test( "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core:testlib", - "//tensorflow/core/profiler/lib:profiler_session", "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/core/profiler/internal/scoped_annotation_test.cc b/tensorflow/core/profiler/internal/scoped_annotation_test.cc index ddf8c3dbf99..53164f72fdb 100644 --- a/tensorflow/core/profiler/internal/scoped_annotation_test.cc +++ b/tensorflow/core/profiler/internal/scoped_annotation_test.cc @@ -13,22 +13,38 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -/* - * bazel run -c opt --config cuda --dynamic_mode=off \ - * --define tf_use_oss_timeline_nonprod=1 \ - * third_party/tensorflow/core/profiler/internal:scoped_annotation_test \ - * -- --benchmarks=all - */ - #include "absl/strings/str_cat.h" +#include "tensorflow/core/platform/annotation.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/test_benchmark.h" #include "tensorflow/core/platform/tracing.h" -#include "tensorflow/core/profiler/lib/profiler_session.h" namespace tensorflow { namespace { +TEST(ScopedAnnotation, Simple) { + { + tracing::ScopedAnnotation trace("blah"); + EXPECT_EQ(Annotation::CurrentAnnotation(), ""); // not enabled + } + + { + tracing::ScopedAnnotation::Enable(true); + tracing::ScopedAnnotation trace("blah"); + EXPECT_EQ(Annotation::CurrentAnnotation(), "blah"); // enabled + tracing::ScopedAnnotation::Enable(false); + } + { + tracing::ScopedAnnotation::Enable(true); + tracing::ScopedAnnotation outer("foo"); + tracing::ScopedAnnotation inner("bar"); + EXPECT_EQ(Annotation::CurrentAnnotation(), "foo::bar"); // enabled + tracing::ScopedAnnotation::Enable(false); + } + + EXPECT_EQ(Annotation::CurrentAnnotation(), ""); // not enabled +} + std::string GenerateRandomString(int length) { return std::string(length, 'a'); } @@ -48,13 +64,13 @@ BENCHMARK(BM_ScopedAnnotationDisabled)->Arg(8)->Arg(32)->Arg(128); void BM_ScopedAnnotationEnabled(int iters, int annotation_size) { testing::StopTiming(); std::string annotation = GenerateRandomString(annotation_size); - auto profiler_session = - tensorflow::ProfilerSession::Create(/*ProfilerContext*/ nullptr); + tracing::ScopedAnnotation::Enable(true); testing::StartTiming(); for (int i = 0; i < iters; i++) { tracing::ScopedAnnotation trace(annotation); } testing::StopTiming(); + tracing::ScopedAnnotation::Enable(false); } BENCHMARK(BM_ScopedAnnotationEnabled)->Arg(8)->Arg(32)->Arg(128); @@ -62,13 +78,13 @@ BENCHMARK(BM_ScopedAnnotationEnabled)->Arg(8)->Arg(32)->Arg(128); void BM_ScopedAnnotationEnabled_TwoParts(int iters, int annotation_size) { testing::StopTiming(); std::string annotation = GenerateRandomString(annotation_size); - auto profiler_session = - tensorflow::ProfilerSession::Create(/*ProfilerContext*/ nullptr); + tracing::ScopedAnnotation::Enable(true); testing::StartTiming(); for (int i = 0; i < iters; i++) { tracing::ScopedAnnotation trace(annotation, annotation); } testing::StopTiming(); + tracing::ScopedAnnotation::Enable(false); } BENCHMARK(BM_ScopedAnnotationEnabled_TwoParts)->Arg(8)->Arg(32)->Arg(128); @@ -76,31 +92,66 @@ BENCHMARK(BM_ScopedAnnotationEnabled_TwoParts)->Arg(8)->Arg(32)->Arg(128); void BM_ScopedAnnotationEnabled_Nested(int iters, int annotation_size) { testing::StopTiming(); std::string annotation = GenerateRandomString(annotation_size); - auto profiler_session = - tensorflow::ProfilerSession::Create(/*ProfilerContext*/ nullptr); + tracing::ScopedAnnotation::Enable(true); testing::StartTiming(); for (int i = 0; i < iters; i++) { tracing::ScopedAnnotation trace(annotation); { tracing::ScopedAnnotation trace(annotation); } } testing::StopTiming(); + tracing::ScopedAnnotation::Enable(false); } BENCHMARK(BM_ScopedAnnotationEnabled_Nested)->Arg(8)->Arg(32)->Arg(128); void BM_ScopedAnnotationEnabled_Adhoc(int iters, int annotation_size) { testing::StopTiming(); - auto profiler_session = - tensorflow::ProfilerSession::Create(/*ProfilerContext*/ nullptr); + tracing::ScopedAnnotation::Enable(true); 
testing::StartTiming(); for (int i = 0; i < iters; i++) { // generate the annotation on the fly. tracing::ScopedAnnotation trace(absl::StrCat(i, "-", i * i)); } testing::StopTiming(); + tracing::ScopedAnnotation::Enable(false); } BENCHMARK(BM_ScopedAnnotationEnabled_Adhoc)->Arg(8)->Arg(32)->Arg(128); +void BM_ScopedAnnotationDisabled_Lambda(int iters, int annotation_size) { + for (int i = 0; i < iters; i++) { + tracing::ScopedAnnotation trace( + [&]() { return absl::StrCat(i, "-", i * i); }); + } +} + +BENCHMARK(BM_ScopedAnnotationDisabled_Lambda)->Arg(8)->Arg(32)->Arg(128); + +void BM_ScopedAnnotationEnabled_Adhoc_Lambda(int iters, int annotation_size) { + tracing::ScopedAnnotation::Enable(true); + for (int i = 0; i < iters; i++) { + tracing::ScopedAnnotation trace( + [&]() { return absl::StrCat(i, "-", i * i); }); + } + tracing::ScopedAnnotation::Enable(false); +} + +BENCHMARK(BM_ScopedAnnotationEnabled_Adhoc_Lambda)->Arg(8)->Arg(32)->Arg(128); + +void BM_ScopedAnnotationEnabled_TwoPartsLambda(int iters, int annotation_size) { + testing::StopTiming(); + std::string annotation = GenerateRandomString(annotation_size); + tracing::ScopedAnnotation::Enable(true); + testing::StartTiming(); + for (int i = 0; i < iters; i++) { + tracing::ScopedAnnotation trace( + [&]() { return absl::StrCat(annotation, ":", annotation); }); + } + testing::StopTiming(); + tracing::ScopedAnnotation::Enable(false); +} + +BENCHMARK(BM_ScopedAnnotationEnabled_TwoPartsLambda)->Arg(8)->Arg(32)->Arg(128); + } // namespace } // namespace tensorflow From 3b2bf7e9483323aedfe62739def40462350d18dd Mon Sep 17 00:00:00 2001 From: Nupur Garg Date: Fri, 19 Jul 2019 17:47:22 -0700 Subject: [PATCH 0213/3053] Remove explicitly setting the targets to python version PY2. PiperOrigin-RevId: 259069693 --- tensorflow/lite/python/BUILD | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tensorflow/lite/python/BUILD b/tensorflow/lite/python/BUILD index b9176a415e5..9316da8e94c 100644 --- a/tensorflow/lite/python/BUILD +++ b/tensorflow/lite/python/BUILD @@ -25,7 +25,6 @@ py_test( "//tensorflow/lite/python/testdata:interpreter_test_data", "//tensorflow/lite/python/testdata:test_delegate.so", ], - python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_windows", @@ -95,7 +94,6 @@ py_test( name = "lite_test", srcs = ["lite_test.py"], data = ["@tflite_mobilenet_ssd_quant_protobuf//:tflite_graph.pb"], - python_version = "PY2", shard_count = 4, srcs_version = "PY2AND3", tags = [ @@ -111,7 +109,6 @@ py_test( py_test( name = "lite_v2_test", srcs = ["lite_v2_test.py"], - python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_windows", @@ -126,7 +123,6 @@ py_test( py_test( name = "lite_flex_test", srcs = ["lite_flex_test.py"], - python_version = "PY2", srcs_version = "PY2AND3", tags = [ # TODO(b/111881877): Enable in oss after resolving op registry issues. 
@@ -143,7 +139,6 @@ py_test( py_test( name = "lite_mlir_test", srcs = ["lite_mlir_test.py"], - python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_oss", @@ -174,7 +169,6 @@ py_library( py_test( name = "util_test", srcs = ["util_test.py"], - python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_windows", @@ -240,7 +234,6 @@ py_library( py_test( name = "convert_test", srcs = ["convert_test.py"], - python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":convert", @@ -272,7 +265,6 @@ py_library( py_test( name = "convert_saved_model_test", srcs = ["convert_saved_model_test.py"], - python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_windows", From 6dcc61a0aef77660354f81db285f028c3cfaf5af Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Fri, 19 Jul 2019 17:54:01 -0700 Subject: [PATCH 0214/3053] Use ObjectIdentitySet instead of set() PiperOrigin-RevId: 259070523 --- tensorflow/python/framework/auto_control_deps.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/framework/auto_control_deps.py b/tensorflow/python/framework/auto_control_deps.py index 2e656857e87..1b45286bfe9 100644 --- a/tensorflow/python/framework/auto_control_deps.py +++ b/tensorflow/python/framework/auto_control_deps.py @@ -27,6 +27,7 @@ from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import control_flow_util from tensorflow.python.ops import tensor_array_ops from tensorflow.python.util import nest +from tensorflow.python.util import object_identity from tensorflow.python.util import tf_decorator # Op types that should not run in program order, e.g. because they need to run @@ -110,7 +111,7 @@ class AutomaticControlDependencies(object): """ def __init__(self): - self._returned_tensors = set() + self._returned_tensors = object_identity.ObjectIdentitySet() self.ops_which_must_run = set() def mark_as_return(self, tensor): From 229dae116a1e13b9a6286a7a6bf26c5c3ab6bf28 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 19 Jul 2019 17:58:46 -0700 Subject: [PATCH 0215/3053] Disallow dataset iterators in Keras fit, predict, and evaluate. 
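For illustration only (not part of the patch, and assuming the `tf.keras` API of this release), a minimal sketch of the usage the change below enforces: pass the `tf.data.Dataset` itself to `fit`/`evaluate`/`predict` rather than an iterator made from it.

    # Hypothetical standalone sketch; layer sizes and data shapes are arbitrary.
    import numpy as np
    import tensorflow as tf

    model = tf.keras.Sequential([tf.keras.layers.Dense(4, input_shape=(3,))])
    model.compile(optimizer='rmsprop', loss='mse')

    x = np.zeros((10, 3), np.float32)
    y = np.zeros((10, 4), np.float32)
    dataset = tf.data.Dataset.from_tensor_slices((x, y)).repeat().batch(10)

    model.fit(dataset, epochs=1, steps_per_epoch=2)          # supported
    # model.fit(iter(dataset), epochs=1, steps_per_epoch=2)  # rejected by the new check with a ValueError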
PiperOrigin-RevId: 259071119 --- .../python/keras/engine/sequential_test.py | 3 +- tensorflow/python/keras/engine/training.py | 74 +++++++------ .../keras/engine/training_dataset_test.py | 104 +----------------- .../keras/engine/training_eager_test.py | 23 +--- .../python/keras/engine/training_test.py | 5 +- .../python/keras/engine/training_v2_utils.py | 16 +-- .../python/keras/model_subclassing_test.py | 7 +- 7 files changed, 65 insertions(+), 167 deletions(-) diff --git a/tensorflow/python/keras/engine/sequential_test.py b/tensorflow/python/keras/engine/sequential_test.py index 0dca345e117..babb37d6c37 100644 --- a/tensorflow/python/keras/engine/sequential_test.py +++ b/tensorflow/python/keras/engine/sequential_test.py @@ -153,9 +153,8 @@ class TestSequential(keras_parameterized.TestCase): dataset = dataset_ops.Dataset.from_tensor_slices((x, y)) dataset = dataset.repeat(100) dataset = dataset.batch(10) - iterator = dataset_ops.make_one_shot_iterator(dataset) - model.fit(iterator, epochs=1, steps_per_epoch=steps_per_epoch) + model.fit(dataset, epochs=1, steps_per_epoch=steps_per_epoch) self.assertTrue(model.built) self.assertEqual(len(model.weights), 2 * 2) self.assertFalse(model._is_graph_network) diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index 1fefa5744cd..a415358ff03 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -465,11 +465,21 @@ class Model(network.Network): def _select_training_loop(self, inputs): """Select training loop for fit/eval/predict based on the inputs.""" + # TODO(kaftan) or TODO(scottzhu): This check should eventually be nicely + # integrated into the data adapters in the v2 loop. We can't do this yet + # because we currently have to fall back for unhandled data types. + if isinstance(inputs, (iterator_ops.Iterator, + iterator_ops.IteratorV2)): + raise ValueError('For performance reasons Keras `fit`, `evaluate` and' + '`predict` accept tf.data `Datasets` as input but not ' + 'iterators that have been manually generated from ' + 'Datasets by users. Please directly pass in the ' + 'original `Dataset` object instead of passing in ' + '`iter(dataset)`.') + # Experiment training loop with default DS path. if (context.executing_eagerly() and self._run_distributed - and not isinstance(inputs, (iterator_ops.Iterator, - iterator_ops.IteratorV2)) # TODO(scottzhu): Finish getting sequences working with the v2 loops. and not isinstance(inputs, (data_utils.Sequence)) and not distributed_training_utils.is_tpu_strategy( @@ -535,7 +545,7 @@ class Model(network.Network): (in case the model has multiple inputs). - A dict mapping input names to the corresponding array/tensors, if the model has named inputs. - - A `tf.data` dataset or a dataset iterator. Should return a tuple + - A `tf.data` dataset. Should return a tuple of either `(inputs, targets)` or `(inputs, targets, sample_weights)`. - A generator or `keras.utils.Sequence` returning `(inputs, targets)` @@ -543,14 +553,14 @@ class Model(network.Network): y: Target data. Like the input data `x`, it could be either Numpy array(s) or TensorFlow tensor(s). It should be consistent with `x` (you cannot have Numpy inputs and - tensor targets, or inversely). If `x` is a dataset, dataset - iterator, generator, or `keras.utils.Sequence` instance, `y` should + tensor targets, or inversely). 
If `x` is a dataset, generator, + or `keras.utils.Sequence` instance, `y` should not be specified (since targets will be obtained from `x`). batch_size: Integer or `None`. Number of samples per gradient update. If unspecified, `batch_size` will default to 32. Do not specify the `batch_size` if your data is in the - form of symbolic tensors, dataset, dataset iterators, + form of symbolic tensors, datasets, generators, or `keras.utils.Sequence` instances (since they generate batches). epochs: Integer. Number of epochs to train the model. @@ -577,7 +587,7 @@ class Model(network.Network): on this data at the end of each epoch. The validation data is selected from the last samples in the `x` and `y` data provided, before shuffling. This argument is - not supported when `x` is a dataset, dataset iterator, generator or + not supported when `x` is a dataset, generator or `keras.utils.Sequence` instance. validation_data: Data on which to evaluate the loss and any model metrics at the end of each epoch. @@ -586,7 +596,7 @@ class Model(network.Network): `validation_data` could be: - tuple `(x_val, y_val)` of Numpy arrays or tensors - tuple `(x_val, y_val, val_sample_weights)` of Numpy arrays - - dataset or a dataset iterator + - dataset For the first two cases, `batch_size` must be provided. For the last case, `validation_steps` must be provided. shuffle: Boolean (whether to shuffle the training data @@ -611,7 +621,7 @@ class Model(network.Network): to apply a different weight to every timestep of every sample. In this case you should make sure to specify `sample_weight_mode="temporal"` in `compile()`. This argument is not - supported when `x` is a dataset, dataset iterator, generator, or + supported when `x` is a dataset, generator, or `keras.utils.Sequence` instance, instead provide the sample_weights as the third element of `x`. initial_epoch: Integer. @@ -624,14 +634,14 @@ class Model(network.Network): TensorFlow data tensors, the default `None` is equal to the number of samples in your dataset divided by the batch size, or 1 if that cannot be determined. If x is a - `tf.data` dataset or a dataset iterator, and 'steps_per_epoch' + `tf.data` dataset, and 'steps_per_epoch' is None, the epoch will run until the input dataset is exhausted. This argument is not supported with array inputs. validation_steps: Only relevant if `validation_data` is provided and - is a dataset or dataset iterator. Total number of steps (batches of + is a `tf.data` dataset. Total number of steps (batches of samples) to draw before stopping when performing validation at the end of every epoch. If validation_data is a `tf.data` dataset - or a dataset iterator, and 'validation_steps' is None, validation + and 'validation_steps' is None, validation will run until the `validation_data` dataset is exhausted. validation_freq: Only relevant if validation data is provided. Integer or `collections.Container` instance (e.g. list, tuple, etc.). If an @@ -722,20 +732,20 @@ class Model(network.Network): (in case the model has multiple inputs). - A dict mapping input names to the corresponding array/tensors, if the model has named inputs. - - A `tf.data` dataset or a dataset iterator. + - A `tf.data` dataset. - A generator or `keras.utils.Sequence` instance. y: Target data. Like the input data `x`, it could be either Numpy array(s) or TensorFlow tensor(s). It should be consistent with `x` (you cannot have Numpy inputs and tensor targets, or inversely). 
- If `x` is a dataset, dataset iterator, generator or + If `x` is a dataset, generator or `keras.utils.Sequence` instance, `y` should not be specified (since targets will be obtained from the iterator/dataset). batch_size: Integer or `None`. Number of samples per gradient update. If unspecified, `batch_size` will default to 32. Do not specify the `batch_size` is your data is in the - form of symbolic tensors, dataset, dataset iterators, + form of symbolic tensors, dataset, generators, or `keras.utils.Sequence` instances (since they generate batches). verbose: 0 or 1. Verbosity mode. @@ -751,13 +761,13 @@ class Model(network.Network): to apply a different weight to every timestep of every sample. In this case you should make sure to specify `sample_weight_mode="temporal"` in `compile()`. This argument is not - supported when `x` is a dataset or a dataset iterator, instead pass + supported when `x` is a dataset, instead pass sample weights as the third element of `x`. steps: Integer or `None`. Total number of steps (batches of samples) before declaring the evaluation round finished. Ignored with the default value of `None`. - If x is a `tf.data` dataset or a dataset iterator, and `steps` is + If x is a `tf.data` dataset and `steps` is None, 'evaluate' will run until the dataset is exhausted. This argument is not supported with array inputs. callbacks: List of `keras.callbacks.Callback` instances. @@ -822,20 +832,20 @@ class Model(network.Network): (in case the model has multiple inputs). - A TensorFlow tensor, or a list of tensors (in case the model has multiple inputs). - - A `tf.data` dataset or a dataset iterator. + - A `tf.data` dataset. - A generator or `keras.utils.Sequence` instance. batch_size: Integer or `None`. Number of samples per gradient update. If unspecified, `batch_size` will default to 32. Do not specify the `batch_size` is your data is in the - form of symbolic tensors, dataset, dataset iterators, + form of symbolic tensors, dataset, generators, or `keras.utils.Sequence` instances (since they generate batches). verbose: Verbosity mode, 0 or 1. steps: Total number of steps (batches of samples) before declaring the prediction round finished. Ignored with the default value of `None`. If x is a `tf.data` - dataset or a dataset iterator, and `steps` is None, `predict` will + dataset and `steps` is None, `predict` will run until the input dataset is exhausted. callbacks: List of `keras.callbacks.Callback` instances. List of callbacks to apply during prediction. @@ -904,11 +914,11 @@ class Model(network.Network): (in case the model has multiple inputs). - A dict mapping input names to the corresponding array/tensors, if the model has named inputs. - - A `tf.data` dataset or a dataset iterator. + - A `tf.data` dataset. y: Target data. Like the input data `x`, it could be either Numpy array(s) or TensorFlow tensor(s). It should be consistent with `x` (you cannot have Numpy inputs and tensor targets, or inversely). If - `x` is a dataset or a dataset iterator, `y` should not be specified + `x` is a dataset, `y` should not be specified (since targets will be obtained from the iterator). sample_weight: Optional array of the same length as x, containing weights to apply to the model's loss for each sample. In the case of @@ -916,7 +926,7 @@ class Model(network.Network): sequence_length), to apply a different weight to every timestep of every sample. In this case you should make sure to specify sample_weight_mode="temporal" in compile(). 
This argument is not - supported when `x` is a dataset or a dataset iterator. + supported when `x` is a dataset. class_weight: Optional dictionary mapping class indices (integers) to a weight (float) to apply to the model's loss for the samples from this class during training. This can be useful to tell the model to "pay @@ -993,13 +1003,12 @@ class Model(network.Network): (in case the model has multiple inputs). - A dict mapping input names to the corresponding array/tensors, if the model has named inputs. - - A `tf.data` dataset or a dataset iterator. + - A `tf.data` dataset. y: Target data. Like the input data `x`, it could be either Numpy array(s) or TensorFlow tensor(s). It should be consistent with `x` (you cannot have Numpy inputs and - tensor targets, or inversely). If `x` is a dataset or a - dataset iterator, `y` should not be specified - (since targets will be obtained from the iterator). + tensor targets, or inversely). If `x` is a dataset `y` should + not be specified (since targets will be obtained from the iterator). sample_weight: Optional array of the same length as x, containing weights to apply to the model's loss for each sample. In the case of temporal data, you can pass a 2D array @@ -1007,7 +1016,7 @@ class Model(network.Network): to apply a different weight to every timestep of every sample. In this case you should make sure to specify sample_weight_mode="temporal" in compile(). This argument is not - supported when `x` is a dataset or a dataset iterator. + supported when `x` is a dataset. reset_metrics: If `True`, the metrics returned will be only for this batch. If `False`, the metrics will be statefully accumulated across batches. @@ -1068,7 +1077,7 @@ class Model(network.Network): (in case the model has multiple inputs). - A TensorFlow tensor, or a list of tensors (in case the model has multiple inputs). - - A `tf.data` dataset or a dataset iterator. + - A `tf.data` dataset. Returns: Numpy array(s) of predictions. @@ -2221,13 +2230,12 @@ class Model(network.Network): (in case the model has multiple inputs). - A dict mapping input names to the corresponding array/tensors, if the model has named inputs. - - A `tf.data` dataset or a dataset iterator. + - A `tf.data` dataset. y: Target data. Like the input data `x`, it could be either Numpy array(s) or TensorFlow tensor(s). It should be consistent with `x` (you cannot have Numpy inputs and - tensor targets, or inversely). If `x` is a dataset or a - dataset iterator, `y` should not be specified - (since targets will be obtained from the iterator). + tensor targets, or inversely). If `x` is a dataset, `y` should not be + specified (since targets will be obtained from the iterator). sample_weight: An optional sample-weight array passed by the user to weight the importance of each sample in `x`. 
class_weight: An optional class-weight array by the user to diff --git a/tensorflow/python/keras/engine/training_dataset_test.py b/tensorflow/python/keras/engine/training_dataset_test.py index cd3613198fd..145465b9f3b 100644 --- a/tensorflow/python/keras/engine/training_dataset_test.py +++ b/tensorflow/python/keras/engine/training_dataset_test.py @@ -47,100 +47,6 @@ class BatchCounterCallback(callbacks.Callback): self.batch_count += 1 -class TestTrainingWithDatasetIterators(keras_parameterized.TestCase): - - @keras_parameterized.run_with_all_model_types - @keras_parameterized.run_all_keras_modes - def test_training_and_eval_methods_on_iterators_single_io(self): - model = testing_utils.get_small_mlp(1, 4, input_dim=3) - optimizer = 'rmsprop' - loss = 'mse' - metrics = ['mae', metrics_module.CategoricalAccuracy()] - model.compile( - optimizer, - loss, - metrics=metrics, - run_eagerly=testing_utils.should_run_eagerly(), - run_distributed=testing_utils.should_run_distributed()) - - inputs = np.zeros((10, 3), np.float32) - targets = np.zeros((10, 4), np.float32) - dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets)) - dataset = dataset.repeat(100) - dataset = dataset.batch(10) - iterator = dataset_ops.make_one_shot_iterator(dataset) - - model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1) - model.evaluate(iterator, steps=2, verbose=1) - model.predict(iterator, steps=2) - - # Test with validation data - model.fit(iterator, - epochs=1, steps_per_epoch=2, verbose=0, - validation_data=iterator, validation_steps=2) - # Test with validation split - with self.assertRaisesRegexp( - ValueError, '`validation_split` argument is not supported when '): - model.fit(iterator, - epochs=1, steps_per_epoch=2, verbose=0, - validation_split=0.5, validation_steps=2) - - # Test with sample weight. 
- sample_weight = np.random.random((10,)) - with self.assertRaisesRegexp( - ValueError, '`sample_weight` argument is not supported ' - 'when input `x` is a dataset or a dataset iterator'): - model.fit( - iterator, - epochs=1, - steps_per_epoch=2, - verbose=0, - sample_weight=sample_weight) - - # Test invalid usage - with self.assertRaisesRegexp(ValueError, - 'you should not specify a target'): - model.fit(iterator, iterator, - epochs=1, steps_per_epoch=2, verbose=0) - - with self.assertRaisesRegexp( - ValueError, 'the `steps_per_epoch` argument'): - model.fit(iterator, epochs=1, verbose=0) - with self.assertRaisesRegexp(ValueError, - 'the `steps` argument'): - model.evaluate(iterator, verbose=0) - with self.assertRaisesRegexp(ValueError, - 'the `steps` argument'): - model.predict(iterator, verbose=0) - - @keras_parameterized.run_with_all_model_types - @keras_parameterized.run_all_keras_modes - def test_iterators_running_out_of_data(self): - model = testing_utils.get_small_mlp(1, 4, input_dim=3) - optimizer = 'rmsprop' - loss = 'mse' - metrics = ['mae'] - model.compile( - optimizer, - loss, - metrics=metrics, - run_eagerly=testing_utils.should_run_eagerly(), - run_distributed=testing_utils.should_run_distributed()) - - inputs = np.zeros((10, 3), np.float32) - targets = np.zeros((10, 4), np.float32) - dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets)) - dataset = dataset.repeat(2) - dataset = dataset.batch(10) - iterator = dataset_ops.make_one_shot_iterator(dataset) - - with test.mock.patch.object(logging, 'warning') as mock_log: - model.fit(iterator, epochs=1, steps_per_epoch=3, verbose=0) - self.assertRegexpMatches( - str(mock_log.call_args), - 'dataset iterator ran out of data') - - class TestTrainingWithDataset(keras_parameterized.TestCase): @keras_parameterized.run_with_all_model_types @@ -618,11 +524,11 @@ class TestTrainingWithDataset(keras_parameterized.TestCase): model.fit(dataset) -class TestMetricsWithDatasetIterators(keras_parameterized.TestCase): +class TestMetricsWithDatasets(keras_parameterized.TestCase): @keras_parameterized.run_with_all_model_types @keras_parameterized.run_all_keras_modes - def test_metrics_correctness_with_iterator(self): + def test_metrics_correctness_with_dataset(self): layers = [ keras.layers.Dense(8, activation='relu', input_dim=4, kernel_initializer='ones'), @@ -643,8 +549,7 @@ class TestMetricsWithDatasetIterators(keras_parameterized.TestCase): y = np.random.randint(2, size=(100, 1)).astype(np.float32) dataset = dataset_ops.Dataset.from_tensor_slices((x, y)) dataset = dataset.batch(10) - iterator = dataset_ops.make_one_shot_iterator(dataset) - outs = model.evaluate(iterator, steps=10) + outs = model.evaluate(dataset, steps=10) self.assertEqual(np.around(outs[1], decimals=1), 0.5) self.assertEqual(np.around(outs[2], decimals=1), 0.5) @@ -652,8 +557,7 @@ class TestMetricsWithDatasetIterators(keras_parameterized.TestCase): dataset = dataset_ops.Dataset.from_tensor_slices((x, y)) dataset = dataset.repeat(100) dataset = dataset.batch(10) - iterator = dataset_ops.make_one_shot_iterator(dataset) - outs = model.evaluate(iterator, steps=10) + outs = model.evaluate(dataset, steps=10) self.assertEqual(outs[1], 0.) self.assertEqual(outs[2], 0.) 
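The test above exercises `evaluate` directly on a `tf.data.Dataset` with an explicit `steps` count. A standalone sketch of that pattern (illustrative only, not part of the patch; initializers and shapes are arbitrary):

    import numpy as np
    import tensorflow as tf

    model = tf.keras.Sequential([
        tf.keras.layers.Dense(8, activation='relu', input_shape=(4,)),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ])
    model.compile('rmsprop', 'binary_crossentropy', metrics=['accuracy'])

    x = np.random.random((100, 4)).astype(np.float32)
    y = np.random.randint(2, size=(100, 1)).astype(np.float32)
    dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(10)  # 10 batches

    loss, acc = model.evaluate(dataset, steps=10)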
diff --git a/tensorflow/python/keras/engine/training_eager_test.py b/tensorflow/python/keras/engine/training_eager_test.py index 57d2f50d2ec..e74c5b678d4 100644 --- a/tensorflow/python/keras/engine/training_eager_test.py +++ b/tensorflow/python/keras/engine/training_eager_test.py @@ -183,30 +183,20 @@ class TrainingTest(keras_parameterized.TestCase): x = array_ops.zeros(shape=(10, 3)) y = array_ops.zeros(shape=(10, 4)) dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).repeat(10).batch(5) - iterator = dataset_ops.make_one_shot_iterator(dataset) validation_dataset = dataset_ops.Dataset.from_tensor_slices( (x, y)).repeat().batch(5) # Infinite dataset. - validation_iterator = dataset_ops.make_one_shot_iterator(validation_dataset) - with self.assertRaisesRegexp( - ValueError, r'specify .* `steps_per_epoch`'): - model.fit(iterator, epochs=1, verbose=0) - if not context.executing_eagerly(): - # In eager execution, `array_ops.zeros` returns value tensors - # which can be used for validation without a `validation_steps` argument. - with self.assertRaisesRegexp( - ValueError, r'provide either `batch_size` or `validation_steps`'): - model.fit(iterator, steps_per_epoch=2, epochs=1, verbose=0, - validation_data=(x, y)) + model.fit(dataset, epochs=1, verbose=0) + # Step argument is required for infinite datasets. with self.assertRaisesRegexp(ValueError, 'specify the `validation_steps` argument.'): - model.fit(iterator, steps_per_epoch=2, epochs=1, verbose=0, + model.fit(dataset, steps_per_epoch=2, epochs=1, verbose=0, validation_data=validation_dataset) with self.assertRaisesRegexp(ValueError, 'specify the `validation_steps` argument.'): - model.fit(iterator, steps_per_epoch=2, epochs=1, verbose=0, - validation_data=validation_iterator) + model.fit(dataset, steps_per_epoch=2, epochs=1, verbose=0, + validation_data=validation_dataset) # TODO(b/120931266): Enable test on subclassed models after bug causing an # extra dimension to be added to predict outputs is fixed. 
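The assertions above capture the one case that still needs an explicit step count: a validation `Dataset` that repeats forever, whose size Keras cannot infer. A standalone sketch of the same pattern (illustrative only, not part of the patch):

    import numpy as np
    import tensorflow as tf

    model = tf.keras.Sequential([tf.keras.layers.Dense(4, input_shape=(3,))])
    model.compile('rmsprop', 'mse')

    x = np.zeros((10, 3), np.float32)
    y = np.zeros((10, 4), np.float32)
    train_ds = tf.data.Dataset.from_tensor_slices((x, y)).repeat(10).batch(5)
    val_ds = tf.data.Dataset.from_tensor_slices((x, y)).repeat().batch(5)  # infinite

    # Without validation_steps this raises a ValueError for the infinite dataset:
    # model.fit(train_ds, epochs=1, steps_per_epoch=2, validation_data=val_ds)

    model.fit(train_ds, epochs=1, steps_per_epoch=2,
              validation_data=val_ds, validation_steps=2)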
@@ -282,8 +272,7 @@ class CorrectnessTest(keras_parameterized.TestCase): dataset = dataset_ops.Dataset.from_tensor_slices((x, y)) dataset = dataset.repeat(100) dataset = dataset.batch(10) - iterator = dataset_ops.make_one_shot_iterator(dataset) - history = model.fit(iterator, epochs=1, steps_per_epoch=10) + history = model.fit(dataset, epochs=1, steps_per_epoch=10) self.assertAlmostEqual(history.history['loss'][-1], 0.5836, 4) def test_loss_in_call(self): diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py index 9c82bc1a5ae..9f020221322 100644 --- a/tensorflow/python/keras/engine/training_test.py +++ b/tensorflow/python/keras/engine/training_test.py @@ -859,8 +859,7 @@ class TrainingTest(keras_parameterized.TestCase): dataset = dataset_ops.Dataset.from_tensor_slices((x_train, y_train)) dataset = dataset.repeat(10) dataset = dataset.batch(10) - iterator = dataset_ops.make_one_shot_iterator(dataset) - model.fit(iterator, epochs=1, steps_per_epoch=2) + model.fit(dataset, epochs=1, steps_per_epoch=2) if context.executing_eagerly(): # Test with eager execution @@ -870,7 +869,7 @@ class TrainingTest(keras_parameterized.TestCase): model.fit(x_train, y_train, batch_size=5, epochs=1) # Test with eager execution and iterator - model.fit(iterator, epochs=1, steps_per_epoch=2) + model.fit(dataset, epochs=1, steps_per_epoch=2) def test_losses_in_defun(self): with context.eager_mode(): diff --git a/tensorflow/python/keras/engine/training_v2_utils.py b/tensorflow/python/keras/engine/training_v2_utils.py index 2f42a5f531b..982ef2a71a1 100644 --- a/tensorflow/python/keras/engine/training_v2_utils.py +++ b/tensorflow/python/keras/engine/training_v2_utils.py @@ -178,11 +178,11 @@ def train_on_batch( (in case the model has multiple inputs). - A dict mapping input names to the corresponding array/tensors, if the model has named inputs. - - A `tf.data` dataset or a dataset iterator. + - A `tf.data` dataset. y: Target data. Like the input data `x`, it could be either Numpy array(s) or TensorFlow tensor(s). It should be consistent with `x` (you cannot have Numpy inputs and tensor targets, or inversely). If - `x` is a dataset or a dataset iterator, `y` should not be specified + `x` is a dataset `y` should not be specified (since targets will be obtained from the iterator). sample_weight: Optional array of the same length as x, containing weights to apply to the model's loss for each sample. In the case of @@ -190,7 +190,7 @@ def train_on_batch( sequence_length), to apply a different weight to every timestep of every sample. In this case you should make sure to specify sample_weight_mode="temporal" in compile(). This argument is not - supported when `x` is a dataset or a dataset iterator. + supported when `x` is a dataset. class_weight: Optional dictionary mapping class indices (integers) to a weight (float) to apply to the model's loss for the samples from this class during training. This can be useful to tell the model to "pay @@ -249,12 +249,12 @@ def test_on_batch(model, x, y=None, sample_weight=None, reset_metrics=True): (in case the model has multiple inputs). - A dict mapping input names to the corresponding array/tensors, if the model has named inputs. - - A `tf.data` dataset or a dataset iterator. + - A `tf.data` dataset. y: Target data. Like the input data `x`, it could be either Numpy array(s) or TensorFlow tensor(s). It should be consistent with `x` (you cannot have Numpy inputs and - tensor targets, or inversely). 
If `x` is a dataset or a - dataset iterator, `y` should not be specified + tensor targets, or inversely). If `x` is a dataset, + `y` should not be specified (since targets will be obtained from the iterator). sample_weight: Optional array of the same length as x, containing weights to apply to the model's loss for each sample. @@ -263,7 +263,7 @@ def test_on_batch(model, x, y=None, sample_weight=None, reset_metrics=True): to apply a different weight to every timestep of every sample. In this case you should make sure to specify sample_weight_mode="temporal" in compile(). This argument is not - supported when `x` is a dataset or a dataset iterator. + supported when `x` is a dataset. reset_metrics: If `True`, the metrics returned will be only for this batch. If `False`, the metrics will be statefully accumulated across batches. @@ -310,7 +310,7 @@ def predict_on_batch(model, x): (in case the model has multiple inputs). - A TensorFlow tensor, or a list of tensors (in case the model has multiple inputs). - - A `tf.data` dataset or a dataset iterator. + - A `tf.data` dataset. Returns: Numpy array(s) of predictions. diff --git a/tensorflow/python/keras/model_subclassing_test.py b/tensorflow/python/keras/model_subclassing_test.py index eecb3b5bd20..39d6594a318 100644 --- a/tensorflow/python/keras/model_subclassing_test.py +++ b/tensorflow/python/keras/model_subclassing_test.py @@ -646,7 +646,7 @@ class ModelSubclassCompiledTest(keras_parameterized.TestCase): model.fit([x1, x2], [y1, y2], epochs=2, batch_size=32, verbose=0) _ = model.evaluate([x1, x2], [y1, y2], verbose=0) - def test_single_io_workflow_with_dataset_iterators(self): + def test_single_io_workflow_with_datasets(self): num_classes = 2 num_samples = 10 input_dim = 50 @@ -664,10 +664,9 @@ class ModelSubclassCompiledTest(keras_parameterized.TestCase): dataset = dataset_ops.Dataset.from_tensor_slices((x, y)) dataset = dataset.repeat(100) dataset = dataset.batch(10) - iterator = dataset_ops.make_one_shot_iterator(dataset) - model.fit(iterator, epochs=2, steps_per_epoch=10, verbose=0) - _ = model.evaluate(iterator, steps=10, verbose=0) + model.fit(dataset, epochs=2, steps_per_epoch=10, verbose=0) + _ = model.evaluate(dataset, steps=10, verbose=0) def test_attributes(self): # layers, weights, trainable_weights, non_trainable_weights, inputs, outputs From ba1654087a4966bd85328f399ed6f288da4b84db Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Fri, 19 Jul 2019 18:03:08 -0700 Subject: [PATCH 0216/3053] Avoid using equality for adding weights PiperOrigin-RevId: 259071753 --- tensorflow/python/keras/engine/base_layer.py | 31 ++++++++++++-------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index 4cd6fa74819..5663ff16745 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -2163,18 +2163,25 @@ class Layer(module.Module): for val in nest.flatten(value): # TODO(b/126450014): Remove `_UnreadVariable` check here when assign ops # no longer return True for isinstance Variable checks. 
- if (isinstance(val, tf_variables.Variable) and - not isinstance(val, resource_variable_ops._UnreadVariable)): # pylint: disable=protected-access - # Users may add extra weights/variables - # simply by assigning them to attributes (invalid for graph networks) - self._maybe_create_attribute('_trainable_weights', []) - self._maybe_create_attribute('_non_trainable_weights', []) - if val not in self._trainable_weights + self._non_trainable_weights: - if val.trainable: - self._trainable_weights.append(val) - else: - self._non_trainable_weights.append(val) - backend.track_variable(val) + if not isinstance(val, tf_variables.Variable): + continue + if isinstance(val, resource_variable_ops._UnreadVariable): # pylint: disable=protected-access + continue + + # Users may add extra weights/variables + # simply by assigning them to attributes (invalid for graph networks) + self._maybe_create_attribute('_trainable_weights', []) + self._maybe_create_attribute('_non_trainable_weights', []) + if val.trainable: + if any(val is w for w in self._trainable_weights): + continue + self._trainable_weights.append(val) + else: + if any(val is w for w in self._non_trainable_weights): + continue + self._non_trainable_weights.append(val) + + backend.track_variable(val) # Skip the auto trackable from tf.Module to keep status quo. See the comment # at __delattr__. From fdc106e412b9dc67444e8ab15de3cce6b3298e93 Mon Sep 17 00:00:00 2001 From: Yu-Cheng Ling Date: Fri, 19 Jul 2019 18:13:25 -0700 Subject: [PATCH 0217/3053] Handle IdentityN with 1 input in Toco PiperOrigin-RevId: 259072753 --- .../lite/testing/generate_examples_lib.py | 11 +- tensorflow/lite/toco/import_tensorflow.cc | 355 +++++++++--------- 2 files changed, 189 insertions(+), 177 deletions(-) diff --git a/tensorflow/lite/testing/generate_examples_lib.py b/tensorflow/lite/testing/generate_examples_lib.py index 792bf50d16a..472caae8b9f 100644 --- a/tensorflow/lite/testing/generate_examples_lib.py +++ b/tensorflow/lite/testing/generate_examples_lib.py @@ -870,7 +870,7 @@ def make_identity_tests(options): # Chose a set of parameters test_parameters = [{ "input_shape": [[], [1], [3, 3]], - "use_snapshot": [False, True], + "op_to_use": ["identity", "identity_n", "snapshot"], }] def build_graph(parameters): @@ -884,10 +884,13 @@ def make_identity_tests(options): # shape, this conversion still fails. # TODO(b/129197312), remove the walk-around code once the bug is fixed. input_doubled = input_tensor * 2.0 - if parameters["use_snapshot"]: - identity_output = array_ops.snapshot(input_doubled) - else: + if parameters["op_to_use"] == "identity": identity_output = tf.identity(input_doubled) + elif parameters["op_to_use"] == "identity_n": + # Testing `IdentityN` with a single tensor. 
+ identity_output = tf.identity_n([input_doubled])[0] + elif parameters["op_to_use"] == "snapshot": + identity_output = array_ops.snapshot(input_doubled) return [input_tensor], [identity_output] def build_inputs(parameters, sess, inputs, outputs): diff --git a/tensorflow/lite/toco/import_tensorflow.cc b/tensorflow/lite/toco/import_tensorflow.cc index 859fa0f6147..17c7d718dcb 100644 --- a/tensorflow/lite/toco/import_tensorflow.cc +++ b/tensorflow/lite/toco/import_tensorflow.cc @@ -562,6 +562,178 @@ void RetainTensorFlowNodeDef(const NodeDef& node, Operator* op) { node.SerializeToString(&op->tensorflow_node_def); } +void GetOutputNamesFromNodeDef(const NodeDef& node, + const tensorflow::OpDef& op_def, + TensorFlowUnsupportedOperator* op) { + int next_output = 0; + auto add_output = [&node, &next_output, op]() { + if (next_output == 0) { + op->outputs.push_back(node.name()); // Implicit :0. + } else { + op->outputs.push_back(absl::StrCat(node.name(), ":", next_output)); + } + ++next_output; + }; + for (int i = 0; i < op_def.output_arg_size(); ++i) { + string multiples = op_def.output_arg(i).number_attr(); + if (!multiples.empty()) { + CHECK(HasAttr(node, multiples)) << "No attr named " << multiples; + int num_outputs = GetIntAttr(node, multiples); + for (int j = 0; j < num_outputs; ++j) { + add_output(); + } + } else { + string list = op_def.output_arg(i).type_list_attr(); + if (!list.empty()) { + CHECK(HasAttr(node, list)) << "No attr named " << list; + const AttrValue::ListValue& list_value = GetListAttr(node, list); + for (int j = 0; j < list_value.type_size(); ++j) { + add_output(); + } + } else { + add_output(); + } + } + } +} + +void GetOutputTypesFromNodeDef(const NodeDef& node, + const tensorflow::OpDef& op_def, + TensorFlowUnsupportedOperator* op) { + // The given type to the op, or clear the types if invalid. + auto add_type = [&node, op](tensorflow::DataType type) { + if (type == tensorflow::DT_INVALID) { + LOG(WARNING) << "Op node missing output type attribute: " << node.name(); + op->output_data_types.clear(); + } else { + op->output_data_types.push_back(ConvertDataType(type)); + } + }; + + // Retrieve the data type according to the OpDef definition: either the + // "type" or "type_attr" field will be set. + auto get_type = [&node](const tensorflow::OpDef::ArgDef& a) { + if (a.type() != tensorflow::DT_INVALID) { + return a.type(); + } else if (HasAttr(node, a.type_attr())) { + return GetDataTypeAttr(node, a.type_attr()); + } else { + return tensorflow::DT_INVALID; + } + }; + + for (int i = 0; i < op_def.output_arg_size(); ++i) { + string multiples = op_def.output_arg(i).number_attr(); + if (!multiples.empty()) { + CHECK(HasAttr(node, multiples)) << "No attr named " << multiples; + int num_outputs = GetIntAttr(node, multiples); + auto type = get_type(op_def.output_arg(i)); + for (int j = 0; j < num_outputs; ++j) { + add_type(type); + } + } else { + string list = op_def.output_arg(i).type_list_attr(); + if (!list.empty()) { + CHECK(HasAttr(node, list)) << "No attr named " << list; + const AttrValue::ListValue& list_value = GetListAttr(node, list); + for (int j = 0; j < list_value.type_size(); ++j) { + add_type(list_value.type(j)); + } + } else { + add_type(get_type(op_def.output_arg(i))); + } + } + } +} + +tensorflow::Status ConvertUnsupportedOperator( + const NodeDef& node, const TensorFlowImportFlags& tf_import_flags, + const ModelFlags& model_flags, Model* model) { + // Names of special attributes in TF graph that are used by Toco. 
+ static constexpr char kAttrOutputQuantized[] = "_output_quantized"; + static constexpr char kAttrOutputTypes[] = "_output_types"; + static constexpr char kAttrOutputShapes[] = "_output_shapes"; + static constexpr char kAttrSupportOutputTypeFloatInQuantizedOp[] = + "_support_output_type_float_in_quantized_op"; + + LOG(INFO) << "Converting unsupported operation: " << node.op(); + + auto* op = new TensorFlowUnsupportedOperator; + op->tensorflow_op = node.op(); + + // For Flex mode. Please read the comments of the function. + RetainTensorFlowNodeDef(node, op); + + model->operators.emplace_back(op); + + // Parse inputs. + const int num_inputs = GetInputsCount(node, tf_import_flags); + for (int i = 0; i < num_inputs; ++i) { + op->inputs.push_back(node.input(i)); + } + + // Parse outputs. Name them after the node's name, plus an ordinal suffix. + // Note that some outputs are to be multiplied by a named attribute. + const tensorflow::OpDef* op_def = nullptr; + if (tensorflow::OpRegistry::Global()->LookUpOpDef(node.op(), &op_def).ok()) { + GetOutputNamesFromNodeDef(node, *op_def, op); + } else { + op->outputs.push_back(node.name()); // Implicit :0. + } + + // Parse if the op supports quantization + if (HasAttr(node, kAttrOutputQuantized)) { + op->quantized = GetBoolAttr(node, kAttrOutputQuantized); + } + // Parse if the quantized op allows output arrays of type float + if (HasAttr(node, kAttrSupportOutputTypeFloatInQuantizedOp)) { + op->support_output_type_float_in_quantized_op = + GetBoolAttr(node, kAttrSupportOutputTypeFloatInQuantizedOp); + } + + // Parse output type(s). + if (HasAttr(node, kAttrOutputTypes)) { + const auto& output_types = GetListAttr(node, kAttrOutputTypes); + for (int i = 0; i < output_types.type_size(); ++i) { + op->output_data_types.push_back(ConvertDataType(output_types.type(i))); + } + } else if (HasAttr(node, "Tout")) { + const auto& output_type = GetDataTypeAttr(node, "Tout"); + op->output_data_types.push_back(ConvertDataType(output_type)); + } else if (op_def != nullptr) { + GetOutputTypesFromNodeDef(node, *op_def, op); + } else { + // TODO(b/113613439): Figure out how to propagate types for custom ops + // that have no OpDef. + LOG(INFO) << "Unable to determine output type for op: " << node.op(); + } + + // Parse output shape(s). + if (HasAttr(node, kAttrOutputShapes)) { + const auto& output_shapes = GetListAttr(node, kAttrOutputShapes); + Shape output_shape; + for (int i = 0; i < output_shapes.shape_size(); ++i) { + const auto& shape = output_shapes.shape(i); + // TOCO doesn't yet properly handle shapes with wildcard dimensions. + // TODO(b/113613439): Handle shape inference for unsupported ops that have + // shapes with wildcard dimensions. 
+ if (HasWildcardDimension(shape)) { + LOG(INFO) << "Skipping wildcard output shape(s) for node: " + << node.name(); + op->output_shapes.clear(); + break; + } + const auto status = + ImportShape(shape.dim(), /*input_flat_size=*/nullptr, &output_shape); + if (!status.ok()) { + return status; + } + op->output_shapes.push_back(output_shape); + } + } + return tensorflow::Status::OK(); +} + tensorflow::Status ConvertConstOperator( const NodeDef& node, const TensorFlowImportFlags& tf_import_flags, const ModelFlags& model_flags, Model* model) { @@ -839,7 +1011,15 @@ tensorflow::Status ConvertIdentityOperator( const ModelFlags& model_flags, Model* model) { CHECK(node.op() == "Identity" || node.op() == "CheckNumerics" || node.op() == "PlaceholderWithDefault" || node.op() == "StopGradient" || - node.op() == "Snapshot"); + node.op() == "Snapshot" || node.op() == "IdentityN"); + + if (node.op() == "IdentityN" && node.input_size() != 1) { + // When IdentityN doesn't have exactly 1 input, convert it as an unsupported + // op so it's still possible to run with Flex runtime. + return ConvertUnsupportedOperator(node, tf_import_flags, model_flags, + model); + } + auto* op = new TensorFlowIdentityOperator; // Amazingly, some TensorFlow graphs (at least rajeev_lstm.pb) have // identity nodes with multiple inputs, but the other inputs seem @@ -1239,178 +1419,6 @@ tensorflow::Status ConvertSimpleOperatorFlexOk( node, tf_import_flags, model_flags, model); } -void GetOutputNamesFromNodeDef(const NodeDef& node, - const tensorflow::OpDef& op_def, - TensorFlowUnsupportedOperator* op) { - int next_output = 0; - auto add_output = [&node, &next_output, op]() { - if (next_output == 0) { - op->outputs.push_back(node.name()); // Implicit :0. - } else { - op->outputs.push_back(absl::StrCat(node.name(), ":", next_output)); - } - ++next_output; - }; - for (int i = 0; i < op_def.output_arg_size(); ++i) { - string multiples = op_def.output_arg(i).number_attr(); - if (!multiples.empty()) { - CHECK(HasAttr(node, multiples)) << "No attr named " << multiples; - int num_outputs = GetIntAttr(node, multiples); - for (int j = 0; j < num_outputs; ++j) { - add_output(); - } - } else { - string list = op_def.output_arg(i).type_list_attr(); - if (!list.empty()) { - CHECK(HasAttr(node, list)) << "No attr named " << list; - const AttrValue::ListValue& list_value = GetListAttr(node, list); - for (int j = 0; j < list_value.type_size(); ++j) { - add_output(); - } - } else { - add_output(); - } - } - } -} - -void GetOutputTypesFromNodeDef(const NodeDef& node, - const tensorflow::OpDef& op_def, - TensorFlowUnsupportedOperator* op) { - // The given type to the op, or clear the types if invalid. - auto add_type = [&node, op](tensorflow::DataType type) { - if (type == tensorflow::DT_INVALID) { - LOG(WARNING) << "Op node missing output type attribute: " << node.name(); - op->output_data_types.clear(); - } else { - op->output_data_types.push_back(ConvertDataType(type)); - } - }; - - // Retrieve the data type according to the OpDef definition: either the - // "type" or "type_attr" field will be set. 
- auto get_type = [&node](const tensorflow::OpDef::ArgDef& a) { - if (a.type() != tensorflow::DT_INVALID) { - return a.type(); - } else if (HasAttr(node, a.type_attr())) { - return GetDataTypeAttr(node, a.type_attr()); - } else { - return tensorflow::DT_INVALID; - } - }; - - for (int i = 0; i < op_def.output_arg_size(); ++i) { - string multiples = op_def.output_arg(i).number_attr(); - if (!multiples.empty()) { - CHECK(HasAttr(node, multiples)) << "No attr named " << multiples; - int num_outputs = GetIntAttr(node, multiples); - auto type = get_type(op_def.output_arg(i)); - for (int j = 0; j < num_outputs; ++j) { - add_type(type); - } - } else { - string list = op_def.output_arg(i).type_list_attr(); - if (!list.empty()) { - CHECK(HasAttr(node, list)) << "No attr named " << list; - const AttrValue::ListValue& list_value = GetListAttr(node, list); - for (int j = 0; j < list_value.type_size(); ++j) { - add_type(list_value.type(j)); - } - } else { - add_type(get_type(op_def.output_arg(i))); - } - } - } -} - -tensorflow::Status ConvertUnsupportedOperator( - const NodeDef& node, const TensorFlowImportFlags& tf_import_flags, - const ModelFlags& model_flags, Model* model) { - // Names of special attributes in TF graph that are used by Toco. - static constexpr char kAttrOutputQuantized[] = "_output_quantized"; - static constexpr char kAttrOutputTypes[] = "_output_types"; - static constexpr char kAttrOutputShapes[] = "_output_shapes"; - static constexpr char kAttrSupportOutputTypeFloatInQuantizedOp[] = - "_support_output_type_float_in_quantized_op"; - - LOG(INFO) << "Converting unsupported operation: " << node.op(); - - auto* op = new TensorFlowUnsupportedOperator; - op->tensorflow_op = node.op(); - - // For Flex mode. Please read the comments of the function. - RetainTensorFlowNodeDef(node, op); - - model->operators.emplace_back(op); - - // Parse inputs. - const int num_inputs = GetInputsCount(node, tf_import_flags); - for (int i = 0; i < num_inputs; ++i) { - op->inputs.push_back(node.input(i)); - } - - // Parse outputs. Name them after the node's name, plus an ordinal suffix. - // Note that some outputs are to be multiplied by a named attribute. - const tensorflow::OpDef* op_def = nullptr; - if (tensorflow::OpRegistry::Global()->LookUpOpDef(node.op(), &op_def).ok()) { - GetOutputNamesFromNodeDef(node, *op_def, op); - } else { - op->outputs.push_back(node.name()); // Implicit :0. - } - - // Parse if the op supports quantization - if (HasAttr(node, kAttrOutputQuantized)) { - op->quantized = GetBoolAttr(node, kAttrOutputQuantized); - } - // Parse if the quantized op allows output arrays of type float - if (HasAttr(node, kAttrSupportOutputTypeFloatInQuantizedOp)) { - op->support_output_type_float_in_quantized_op = - GetBoolAttr(node, kAttrSupportOutputTypeFloatInQuantizedOp); - } - - // Parse output type(s). - if (HasAttr(node, kAttrOutputTypes)) { - const auto& output_types = GetListAttr(node, kAttrOutputTypes); - for (int i = 0; i < output_types.type_size(); ++i) { - op->output_data_types.push_back(ConvertDataType(output_types.type(i))); - } - } else if (HasAttr(node, "Tout")) { - const auto& output_type = GetDataTypeAttr(node, "Tout"); - op->output_data_types.push_back(ConvertDataType(output_type)); - } else if (op_def != nullptr) { - GetOutputTypesFromNodeDef(node, *op_def, op); - } else { - // TODO(b/113613439): Figure out how to propagate types for custom ops - // that have no OpDef. - LOG(INFO) << "Unable to determine output type for op: " << node.op(); - } - - // Parse output shape(s). 
- if (HasAttr(node, kAttrOutputShapes)) { - const auto& output_shapes = GetListAttr(node, kAttrOutputShapes); - Shape output_shape; - for (int i = 0; i < output_shapes.shape_size(); ++i) { - const auto& shape = output_shapes.shape(i); - // TOCO doesn't yet properly handle shapes with wildcard dimensions. - // TODO(b/113613439): Handle shape inference for unsupported ops that have - // shapes with wildcard dimensions. - if (HasWildcardDimension(shape)) { - LOG(INFO) << "Skipping wildcard output shape(s) for node: " - << node.name(); - op->output_shapes.clear(); - break; - } - const auto status = - ImportShape(shape.dim(), /*input_flat_size=*/nullptr, &output_shape); - if (!status.ok()) { - return status; - } - op->output_shapes.push_back(output_shape); - } - } - return tensorflow::Status::OK(); -} - // Same as ConvertConstOperator, but revert to ConvertUnsupportedOperator if // the types are not supported. Converting Const operators here avoids // expensive copies of the protocol buffers downstream in the flex delegate. @@ -2504,6 +2512,7 @@ ConverterMapType GetTensorFlowNodeConverterMap() { {"GreaterEqual", ConvertSimpleOperator}, {"Identity", ConvertIdentityOperator}, + {"IdentityN", ConvertIdentityOperator}, {"LRN", ConvertLRNOperator}, {"LeakyRelu", ConvertLeakyReluOperator}, {"LegacyFedInput", ConvertPlaceholderOperator}, From 8f55026cc829952921c0e9fe403caaf734645637 Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Fri, 19 Jul 2019 18:18:43 -0700 Subject: [PATCH 0218/3053] Teach TFLite model verifier about all supported types PiperOrigin-RevId: 259073172 --- tensorflow/lite/tools/BUILD | 3 +- tensorflow/lite/tools/verifier.cc | 24 ++++++++++----- tensorflow/lite/tools/verifier_test.cc | 41 +++++++++++++++++++++++++- 3 files changed, 59 insertions(+), 9 deletions(-) diff --git a/tensorflow/lite/tools/BUILD b/tensorflow/lite/tools/BUILD index 3f448b1e5cc..38fc69e8408 100644 --- a/tensorflow/lite/tools/BUILD +++ b/tensorflow/lite/tools/BUILD @@ -91,7 +91,8 @@ cc_test( "//tensorflow/core:framework_lite", "//tensorflow/lite:framework", "//tensorflow/lite:schema_fbs_version", - "//tensorflow/lite/c:c_api_internal", + "//tensorflow/lite:util", + "//tensorflow/lite/core/api", "//tensorflow/lite/schema:schema_fbs", "//tensorflow/lite/testing:util", "@com_google_googletest//:gtest", diff --git a/tensorflow/lite/tools/verifier.cc b/tensorflow/lite/tools/verifier.cc index 04833ed69d7..16ae0a6651d 100644 --- a/tensorflow/lite/tools/verifier.cc +++ b/tensorflow/lite/tools/verifier.cc @@ -130,20 +130,30 @@ bool VerifyNumericTensorBuffer(const Tensor& tensor, const Buffer& buffer, case TensorType_FLOAT32: bytes_required *= sizeof(float); break; - case TensorType_INT8: - bytes_required *= sizeof(int8_t); - break; - case TensorType_UINT8: - bytes_required *= sizeof(uint8_t); + case TensorType_FLOAT16: + bytes_required *= sizeof(uint16_t); break; case TensorType_INT32: bytes_required *= sizeof(int32_t); break; + case TensorType_UINT8: + bytes_required *= sizeof(uint8_t); + break; + case TensorType_INT8: + bytes_required *= sizeof(int8_t); + break; case TensorType_INT64: bytes_required *= sizeof(int64_t); break; - case TensorType_FLOAT16: - // FALLTHROUGH_INTENDED; + case TensorType_BOOL: + bytes_required *= sizeof(bool); + break; + case TensorType_INT16: + bytes_required *= sizeof(uint16_t); + break; + case TensorType_COMPLEX64: + bytes_required *= sizeof(std::complex); + break; default: ReportError(error_reporter, "Tensor %s invalid type: %d", tensor.name()->c_str(), tensor.type()); diff --git 
a/tensorflow/lite/tools/verifier_test.cc b/tensorflow/lite/tools/verifier_test.cc index c89a6fb10d1..ca3a9d63959 100644 --- a/tensorflow/lite/tools/verifier_test.cc +++ b/tensorflow/lite/tools/verifier_test.cc @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/lite/tools/verifier.h" + #include #include @@ -21,11 +23,12 @@ limitations under the License. #include #include "tensorflow/core/framework/numeric_types.h" #include "tensorflow/lite/allocation.h" +#include "tensorflow/lite/core/api/flatbuffer_conversions.h" #include "tensorflow/lite/error_reporter.h" #include "tensorflow/lite/op_resolver.h" #include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/testing/util.h" -#include "tensorflow/lite/tools/verifier.h" +#include "tensorflow/lite/util.h" #include "tensorflow/lite/version.h" namespace tflite { @@ -516,6 +519,42 @@ TEST(VerifyModel, OpWithOptionalTensor) { EXPECT_EQ("", builder.GetErrorString()); } +TEST(VerifyModel, TypedTensorShapeMismatchWithTensorBufferSize) { + TfLiteFlatbufferModelBuilder builder; + for (int tensor_type = TensorType_MIN; tensor_type <= TensorType_MAX; + ++tensor_type) { + if (tensor_type == TensorType_STRING) continue; + builder.AddTensor({2, 3}, static_cast(tensor_type), + {1, 2, 3, 4}, "input"); + builder.FinishModel({}, {}); + ASSERT_FALSE(builder.Verify()); + EXPECT_THAT( + builder.GetErrorString(), + ::testing::ContainsRegex("Tensor input requires .* bytes, but is " + "allocated with 4 bytes buffer")); + } +} + +TEST(VerifyModel, TypedTensorShapeMatchesTensorBufferSize) { + TfLiteFlatbufferModelBuilder builder; + for (int tensor_type = TensorType_MIN; tensor_type <= TensorType_MAX; + ++tensor_type) { + if (tensor_type == TensorType_STRING) continue; + TfLiteType lite_type = kTfLiteNoType; + ASSERT_EQ(ConvertTensorType(static_cast(tensor_type), + &lite_type, /*error_reporter=*/nullptr), + kTfLiteOk); + size_t size_bytes = 0; + ASSERT_EQ(GetSizeOfType(/*context=*/nullptr, lite_type, &size_bytes), + kTfLiteOk); + std::vector buffer(size_bytes); + builder.AddTensor({1}, static_cast(tensor_type), buffer, + "input"); + builder.FinishModel({}, {}); + ASSERT_TRUE(builder.Verify()); + } +} + // TODO(yichengfan): make up malicious files to test with. } // namespace tflite From fba5f43255563cd54f28cacd7ed8b88deb597891 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 19 Jul 2019 18:24:06 -0700 Subject: [PATCH 0219/3053] Improve error reporting for sparkfun edge, fix false error detection issue. PiperOrigin-RevId: 259073602 --- .../sparkfun_edge/image_provider.cc | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/tensorflow/lite/experimental/micro/examples/micro_vision/sparkfun_edge/image_provider.cc b/tensorflow/lite/experimental/micro/examples/micro_vision/sparkfun_edge/image_provider.cc index f5da9865d55..6685d93cf44 100644 --- a/tensorflow/lite/experimental/micro/examples/micro_vision/sparkfun_edge/image_provider.cc +++ b/tensorflow/lite/experimental/micro/examples/micro_vision/sparkfun_edge/image_provider.cc @@ -143,6 +143,10 @@ TfLiteStatus InitCamera(tflite::ErrorReporter* error_reporter) { am_hal_gpio_pinconfig(HM01B0_PIN_DVDD_EN, g_AM_HAL_GPIO_OUTPUT_12); am_hal_gpio_output_set(HM01B0_PIN_DVDD_EN); + // Configure Red LED for debugging. 
+ am_hal_gpio_pinconfig(AM_BSP_GPIO_LED_RED, g_AM_HAL_GPIO_OUTPUT_12); + am_hal_gpio_output_clear(AM_BSP_GPIO_LED_RED); + hm01b0_power_up(&s_HM01B0Cfg); // TODO(njeff): check the delay time to just fit the spec. @@ -153,22 +157,23 @@ TfLiteStatus InitCamera(tflite::ErrorReporter* error_reporter) { // TODO(njeff): check the delay time to just fit the spec. am_util_delay_ms(1); - hm01b0_init_if(&s_HM01B0Cfg); + if (HM01B0_ERR_OK != hm01b0_init_if(&s_HM01B0Cfg)) { + return kTfLiteError; + } - hm01b0_init_system(&s_HM01B0Cfg, (hm_script_t*)sHM01B0InitScript, - sizeof(sHM01B0InitScript) / sizeof(hm_script_t)); + if (HM01B0_ERR_OK != + hm01b0_init_system(&s_HM01B0Cfg, (hm_script_t*)sHM01B0InitScript, + sizeof(sHM01B0InitScript) / sizeof(hm_script_t))) { + return kTfLiteError; + } // Put camera into streaming mode - this makes it so that the camera // constantly captures images. It is still OK to read and image since the // camera uses a double-buffered input. This means there is always one valid // image to read while the other buffer fills. Streaming mode allows the // camera to perform auto exposure constantly. - am_hal_gpio_output_clear(AM_BSP_GPIO_LED_RED); - uint32_t error_code = - hm01b0_set_mode(&s_HM01B0Cfg, HM01B0_REG_MODE_SELECT_STREAMING, 0); - if (error_code == HM01B0_ERR_OK) { - am_hal_gpio_output_set(AM_BSP_GPIO_LED_RED); - + if (HM01B0_ERR_OK != + hm01b0_set_mode(&s_HM01B0Cfg, HM01B0_REG_MODE_SELECT_STREAMING, 0)) { return kTfLiteError; } @@ -182,6 +187,7 @@ TfLiteStatus GetImage(tflite::ErrorReporter* error_reporter, int frame_width, if (!g_is_camera_initialized) { TfLiteStatus init_status = InitCamera(error_reporter); if (init_status != kTfLiteOk) { + am_hal_gpio_output_set(AM_BSP_GPIO_LED_RED); return init_status; } // Drop a few frames until auto exposure is calibrated. From 21b4279e9550acc1c555144cb4ca335e03b5ac4f Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Fri, 19 Jul 2019 18:26:24 -0700 Subject: [PATCH 0220/3053] Improving snapshot's logging PiperOrigin-RevId: 259073737 --- .../kernels/data/experimental/snapshot_dataset_op.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorflow/core/kernels/data/experimental/snapshot_dataset_op.cc b/tensorflow/core/kernels/data/experimental/snapshot_dataset_op.cc index eeaf5051294..4e1b3e31193 100644 --- a/tensorflow/core/kernels/data/experimental/snapshot_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/snapshot_dataset_op.cc @@ -567,9 +567,9 @@ class SnapshotDatasetOp : public UnaryDatasetOpKernel { kbytes_read_ += static_cast(num_bytes) / 1024.0; elements_produced_++; if (elements_produced_ % 10000 == 0) { - VLOG(2) << "Current read throughput (MBPS): " - << ((kbytes_read_ / 1024.0) / - (time_spent_micros_ / 1000000.0)); + LOG(INFO) << "Current read throughput (MBPS): " + << ((kbytes_read_ / 1024.0) / + (time_spent_micros_ / 1000000.0)); } } buffer_.pop_front(); @@ -802,9 +802,9 @@ class SnapshotDatasetOp : public UnaryDatasetOpKernel { elements_produced_++; if (elements_produced_ % 10000 == 0) { - VLOG(2) << "Current write throughput (MBPS): " - << (bytes_produced_ * 1000000.0) / - (time_spent_micros_ * 1024.0 * 1024.0); + LOG(INFO) << "Current write throughput (MBPS): " + << (bytes_produced_ * 1000000.0) / + (time_spent_micros_ * 1024.0 * 1024.0); } return Status::OK(); } From 5b2094bf58d0d148da30bac3796b3d05b344114b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 19 Jul 2019 18:37:56 -0700 Subject: [PATCH 0221/3053] Make delegate options compatible with python3. 
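The fix below, in tensorflow/lite/python/interpreter.py, encodes delegate option keys and values to UTF-8 bytes before storing them in ctypes.c_char_p arrays: under Python 3, str is text and a c_char_p slot only accepts bytes (Python 2's str was already a byte string, which is why the old code only broke on Python 3). A minimal standalone sketch of the pattern follows; the helper name and the example option dict are made up for illustration and are not part of this patch.

import ctypes

def build_delegate_option_arrays(options):
  # Mirrors the patched Delegate constructor: every key/value is encoded to
  # bytes before being assigned into a ctypes.c_char_p array.
  options_keys = (ctypes.c_char_p * len(options))()
  options_values = (ctypes.c_char_p * len(options))()
  for idx, (key, value) in enumerate(options.items()):
    # Assigning a plain str here raises TypeError on Python 3:
    # "bytes or integer address expected instead of str instance".
    options_keys[idx] = str(key).encode('utf-8')
    options_values[idx] = str(value).encode('utf-8')
  return options_keys, options_values

if __name__ == '__main__':
  keys, values = build_delegate_option_arrays({'num_threads': 2})
  print(list(keys), list(values))  # [b'num_threads'] [b'2']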
PiperOrigin-RevId: 259074693 --- tensorflow/lite/python/interpreter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/python/interpreter.py b/tensorflow/lite/python/interpreter.py index f83a438f959..43b90883c8a 100644 --- a/tensorflow/lite/python/interpreter.py +++ b/tensorflow/lite/python/interpreter.py @@ -99,8 +99,8 @@ class Delegate(object): options_keys = (ctypes.c_char_p * len(options))() options_values = (ctypes.c_char_p * len(options))() for idx, (key, value) in enumerate(options.items()): - options_keys[idx] = str(key) - options_values[idx] = str(value) + options_keys[idx] = str(key).encode('utf-8') + options_values[idx] = str(value).encode('utf-8') class ErrorMessageCapture(object): From 3dfb34b6bd38e4d0cb78a5c5f89efd993db0b475 Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Fri, 19 Jul 2019 18:50:48 -0700 Subject: [PATCH 0222/3053] [Grappler] Add `_FusedBatchNormEx` to GenericLayoutOptimizer. PiperOrigin-RevId: 259075903 --- tensorflow/core/grappler/op_types.cc | 4 ++ tensorflow/core/grappler/op_types.h | 1 + .../generic_layout_optimizer_transposer.cc | 52 ++++++++++++++++++- .../generic_layout_optimizer_transposer.h | 8 +++ ...ric_layout_optimizer_transposer_factory.cc | 4 ++ ...ayout_optimizer_transposer_factory_test.cc | 2 + 6 files changed, 69 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc index fcdd366487a..c4de79e7601 100644 --- a/tensorflow/core/grappler/op_types.cc +++ b/tensorflow/core/grappler/op_types.cc @@ -253,6 +253,10 @@ bool IsFusedBatchNorm(const NodeDef& node) { op == "FusedBatchNormV3"; } +bool IsFusedBatchNormEx(const NodeDef& node) { + return node.op() == "_FusedBatchNormEx"; +} + bool IsFusedBatchNormGrad(const NodeDef& node) { const auto& op = node.op(); return op == "FusedBatchNormGrad" || op == "FusedBatchNormGradV2" || diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h index d0562c32e4c..2b2ea5680fb 100644 --- a/tensorflow/core/grappler/op_types.h +++ b/tensorflow/core/grappler/op_types.h @@ -78,6 +78,7 @@ bool IsFill(const NodeDef& node); bool IsFloorDiv(const NodeDef& node); bool IsFloorMod(const NodeDef& node); bool IsFusedBatchNorm(const NodeDef& node); +bool IsFusedBatchNormEx(const NodeDef& node); bool IsFusedBatchNormGrad(const NodeDef& node); bool IsGreater(const NodeDef& node); bool IsGreaterEqual(const NodeDef& node); diff --git a/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.cc b/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.cc index 2b4b4a4ca69..2b8a1eb8970 100644 --- a/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.cc +++ b/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.cc @@ -648,6 +648,9 @@ Status DefaultLayoutSensitiveOpTransposer::TransposeNode( if (!ShouldProcess(*context, *node) || !IsFanoutPortRankN(*node, 0, 4)) { return Status::OK(); } + VLOG(3) << "GenericLayoutOptimizer: transforming node '" << node->GetName() + << "' with op '" << node->GetOp() << "' from data format '" + << context->src_format << "' to '" << context->dst_format << "'"; TF_RETURN_IF_ERROR(UpdateNode(context, node)); TF_RETURN_IF_ERROR(UpdateFaninEdgesWithOp(context, {0}, node, kOpTranspose)); TF_RETURN_IF_ERROR(UpdateFanoutEdgesWithOp(context, {0}, node, kOpTranspose)); @@ -660,6 +663,9 @@ Status AvgPoolGradTransposer::TransposeNode(TransposeContext* context, if (!ShouldProcess(*context, *node) || 
!IsFaninPortRankN(*node, 1, 4)) { return Status::OK(); } + VLOG(3) << "GenericLayoutOptimizer: transforming node '" << node->GetName() + << "' with op '" << node->GetOp() << "' from data format '" + << context->src_format << "' to '" << context->dst_format << "'"; TF_RETURN_IF_ERROR(UpdateNode(context, node)); TF_RETURN_IF_ERROR( UpdateFaninEdgesWithOp(context, {0}, node, kOpDataFormatVecPermute)); @@ -674,6 +680,9 @@ Status BiasAddGradTransposer::TransposeNode(TransposeContext* context, if (!ShouldProcess(*context, *node) || !IsFaninPortRankN(*node, 0, 4)) { return Status::OK(); } + VLOG(3) << "GenericLayoutOptimizer: transforming node '" << node->GetName() + << "' with op '" << node->GetOp() << "' from data format '" + << context->src_format << "' to '" << context->dst_format << "'"; TF_RETURN_IF_ERROR(UpdateNode(context, node)); TF_RETURN_IF_ERROR(UpdateFaninEdgesWithOp(context, {0}, node, kOpTranspose)); // No need to update output shape, as it is always of shape 1-D with size the @@ -689,6 +698,9 @@ Status Conv2DBackpropFilterTransposer::TransposeNode( if (!ShouldProcess(*context, *node) || !IsFanoutPortRankN(*node, 0, 4)) { return Status::OK(); } + VLOG(3) << "GenericLayoutOptimizer: transforming node '" << node->GetName() + << "' with op '" << node->GetOp() << "' from data format '" + << context->src_format << "' to '" << context->dst_format << "'"; TF_RETURN_IF_ERROR(UpdateNode(context, node)); TF_RETURN_IF_ERROR( UpdateFaninEdgesWithOp(context, {0, 2}, node, kOpTranspose)); @@ -705,6 +717,9 @@ Status Conv2DBackpropInputTransposer::TransposeNode( if (!ShouldProcess(*context, *node) || !IsFanoutPortRankN(*node, 0, 4)) { return Status::OK(); } + VLOG(3) << "GenericLayoutOptimizer: transforming node '" << node->GetName() + << "' with op '" << node->GetOp() << "' from data format '" + << context->src_format << "' to '" << context->dst_format << "'"; TF_RETURN_IF_ERROR(UpdateNode(context, node)); TF_RETURN_IF_ERROR( UpdateFaninEdgesWithOp(context, {0}, node, kOpDataFormatVecPermute)); @@ -713,6 +728,27 @@ Status Conv2DBackpropInputTransposer::TransposeNode( return context->graph_view->GetMutationBuilder()->Apply(); } +Status FusedBatchNormExTransposer::TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) { + DCHECK(IsFusedBatchNormEx(*node->node())); + if (!ShouldProcess(*context, *node) || !IsFanoutPortRankN(*node, 0, 4)) { + return Status::OK(); + } + VLOG(3) << "GenericLayoutOptimizer: transforming node '" << node->GetName() + << "' with op '" << node->GetOp() << "' from data format '" + << context->src_format << "' to '" << context->dst_format << "'"; + TF_RETURN_IF_ERROR(UpdateNode(context, node)); + if (node->NumRegularFanins() == 6) { + TF_RETURN_IF_ERROR( + UpdateFaninEdgesWithOp(context, {0, 5}, node, kOpTranspose)); + } else { + TF_RETURN_IF_ERROR( + UpdateFaninEdgesWithOp(context, {0}, node, kOpTranspose)); + } + TF_RETURN_IF_ERROR(UpdateFanoutEdgesWithOp(context, {0}, node, kOpTranspose)); + return context->graph_view->GetMutationBuilder()->Apply(); +} + bool FusedBatchNormGradTransposer::IsTraining( const utils::MutableNodeView& node) const { const auto* is_training_attr = node.GetAttr(kAttrIsTraining); @@ -729,6 +765,9 @@ Status FusedBatchNormGradTransposer::TransposeNode( !IsTraining(*node)) { return Status::OK(); } + VLOG(3) << "GenericLayoutOptimizer: transforming node '" << node->GetName() + << "' with op '" << node->GetOp() << "' from data format '" + << context->src_format << "' to '" << context->dst_format << "'"; 
TF_RETURN_IF_ERROR(UpdateNode(context, node)); TF_RETURN_IF_ERROR( UpdateFaninEdgesWithOp(context, {0, 1}, node, kOpTranspose)); @@ -748,6 +787,9 @@ Status MaxPoolV2Transposer::TransposeNode(TransposeContext* context, !IsFanoutPortRankN(*data_fanin_node, data_fanin.index(), 4)) { return Status::OK(); } + VLOG(3) << "GenericLayoutOptimizer: transforming node '" << node->GetName() + << "' with op '" << node->GetOp() << "' from data format '" + << context->src_format << "' to '" << context->dst_format << "'"; TF_RETURN_IF_ERROR(UpdateNode(context, node)); TF_RETURN_IF_ERROR(UpdateFaninEdgesWithOp(context, {0}, node, kOpTranspose)); TF_RETURN_IF_ERROR( @@ -762,6 +804,9 @@ Status MaxPoolGradTransposer::TransposeNode(TransposeContext* context, if (!ShouldProcess(*context, *node) || !IsFanoutPortRankN(*node, 0, 4)) { return Status::OK(); } + VLOG(3) << "GenericLayoutOptimizer: transforming node '" << node->GetName() + << "' with op '" << node->GetOp() << "' from data format '" + << context->src_format << "' to '" << context->dst_format << "'"; TF_RETURN_IF_ERROR(UpdateNode(context, node)); TF_RETURN_IF_ERROR( UpdateFaninEdgesWithOp(context, {0, 1, 2}, node, kOpTranspose)); @@ -775,6 +820,9 @@ Status MaxPoolGradV2Transposer::TransposeNode(TransposeContext* context, if (!ShouldProcess(*context, *node) || !IsFanoutPortRankN(*node, 0, 4)) { return Status::OK(); } + VLOG(3) << "GenericLayoutOptimizer: transforming node '" << node->GetName() + << "' with op '" << node->GetOp() << "' from data format '" + << context->src_format << "' to '" << context->dst_format << "'"; TF_RETURN_IF_ERROR(UpdateNode(context, node)); TF_RETURN_IF_ERROR( UpdateFaninEdgesWithOp(context, {0, 1, 2}, node, kOpTranspose)); @@ -1607,8 +1655,8 @@ bool IsLayoutSensitiveOp(const NodeDef& node) { IsConv2DBackpropInput(node) || IsDepthwiseConv2dNativeBackpropFilter(node) || IsDepthwiseConv2dNativeBackpropInput(node) || - IsFusedBatchNormGrad(node) || IsMaxPoolV2(node) || - IsMaxPoolGrad(node) || IsMaxPoolGradV2(node) || + IsFusedBatchNormEx(node) || IsFusedBatchNormGrad(node) || + IsMaxPoolV2(node) || IsMaxPoolGrad(node) || IsMaxPoolGradV2(node) || IsMaxPoolGradGradV1(node) || IsMaxPoolGradGradV2(node); } diff --git a/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.h b/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.h index 4da29e2e2d6..be609e84596 100644 --- a/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.h +++ b/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.h @@ -239,6 +239,14 @@ class Conv2DBackpropInputTransposer : public LayoutSensitiveOpTransposer { utils::MutableNodeView* node) override; }; +class FusedBatchNormExTransposer : public LayoutSensitiveOpTransposer { + public: + explicit FusedBatchNormExTransposer() : LayoutSensitiveOpTransposer() {} + + Status TransposeNode(TransposeContext* context, + utils::MutableNodeView* node) override; +}; + class FusedBatchNormGradTransposer : public LayoutSensitiveOpTransposer { public: explicit FusedBatchNormGradTransposer() : LayoutSensitiveOpTransposer() {} diff --git a/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer_factory.cc b/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer_factory.cc index bab17492a4a..59c06d42441 100644 --- a/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer_factory.cc +++ b/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer_factory.cc @@ -43,6 +43,10 @@ std::shared_ptr 
TransposerFactory::GetTransposer( return GetOrCreateIfNotFound( "Conv2DBackpropInput"); } + if (IsFusedBatchNormEx(node)) { + return GetOrCreateIfNotFound( + "FusedBatchNormEx"); + } if (IsFusedBatchNormGrad(node)) { return GetOrCreateIfNotFound( "FusedBatchNormGrad"); diff --git a/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer_factory_test.cc b/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer_factory_test.cc index 9bc3dff3f71..2721b2f0d26 100644 --- a/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer_factory_test.cc +++ b/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer_factory_test.cc @@ -67,6 +67,8 @@ TEST(TransposerFactoryTest, SanityCheck) { CheckSameTransposerForOps({"BiasAddGrad"}, &factory, &transposers); + CheckSameTransposerForOps({"_FusedBatchNormEx"}, &factory, &transposers); + CheckSameTransposerForOps({"FusedBatchNormGrad", "FusedBatchNormGradV2"}, &factory, &transposers); From 72c73236693f83d15cbc4bd5dda851ba7f12738b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 19 Jul 2019 18:59:25 -0700 Subject: [PATCH 0223/3053] [TF:XLA] Don't print XLA:CPU warning when XLA autojit isn't enabled. This was noticed by #30308 PiperOrigin-RevId: 259076536 --- tensorflow/compiler/jit/mark_for_compilation_pass.cc | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index b819998bdc7..91423f63d28 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -1549,9 +1549,7 @@ StatusOr MarkForCompilationPassImpl::ShouldCompileClusterImpl( XlaOpRegistry::AutoclusteringPolicy::kIfEnabledGlobally && global_jit_level_ != OptimizerOptions::OFF); - if (!should_compile && - registration->autoclustering_policy == - XlaOpRegistry::AutoclusteringPolicy::kIfExplicitlyRequested && + if (!should_compile && global_jit_level_ != OptimizerOptions::OFF && device_type.type_string() == DEVICE_CPU) { static std::once_flag once; std::call_once(once, [] { From a54c84965cdc1ec7d6e7eacab5899e8d9305b760 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Fri, 19 Jul 2019 19:06:16 -0700 Subject: [PATCH 0224/3053] [XLA] Use the fast and lazy host callback to free tuple buffers PiperOrigin-RevId: 259077416 --- tensorflow/compiler/xla/service/generic_transfer_manager.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.cc b/tensorflow/compiler/xla/service/generic_transfer_manager.cc index 2eae159861c..d65083d701a 100644 --- a/tensorflow/compiler/xla/service/generic_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/generic_transfer_manager.cc @@ -53,7 +53,7 @@ Status GenericTransferManager::WriteSingleTupleIndexTable( TF_RETURN_IF_ERROR(TransferBufferToDevice( stream, GetByteSizeRequirement(shape), element_pointers->data(), region)); // Ensure the buffer is transferred before we destroy element_pointers. - stream->ThenDoHostCallback([element_pointers]() { + stream->ThenRunAfterNextBlockHostUntilDone([element_pointers]() { /* holds reference to element_pointers in closure */ }); return Status::OK(); From 34ded7e51b0456f4615a787b8a9215e48744f4d7 Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Fri, 19 Jul 2019 19:17:54 -0700 Subject: [PATCH 0225/3053] NFC: Move IfOp and WhileOp to the Op Definition Generation framework. 
* Remove TensorFlowOp trait as there are no remaining users. Auto-generated verifier handles all the checks in the trait. * Remove verification for function attributes existence from the custom verifier as auto-generated verifier already checks that. PiperOrigin-RevId: 259078224 --- .../mlir/lite/flatbuffer_translate.cc | 8 +- .../transforms/lower_static_tensor_list.cc | 4 +- .../compiler/mlir/tensorflow/ir/tf_ops.cc | 95 ++++++------------ .../compiler/mlir/tensorflow/ir/tf_ops.h | 96 ------------------- .../compiler/mlir/tensorflow/ir/tf_ops.td | 83 ++++++++++++++++ .../mlir/tensorflow/tests/tf-ops.mlir | 4 +- .../functional_control_flow_to_cfg.cc | 10 +- 7 files changed, 127 insertions(+), 173 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc b/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc index fca80f836aa..c6a461d7414 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc @@ -545,8 +545,8 @@ Optional> Translator::BuildTensor( } CustomOptionsOffset Translator::CreateIfOpCustomOptions(mlir::TF::IfOp op) { - int then_subgraph_index = subgraph_index_map_.at(op.getThen().str()); - int else_subgraph_index = subgraph_index_map_.at(op.getElse().str()); + int then_subgraph_index = subgraph_index_map_.at(op.then_branch().str()); + int else_subgraph_index = subgraph_index_map_.at(op.else_branch().str()); auto flex_builder = absl::make_unique(); flex_builder->Map([&]() { @@ -559,8 +559,8 @@ CustomOptionsOffset Translator::CreateIfOpCustomOptions(mlir::TF::IfOp op) { CustomOptionsOffset Translator::CreateWhileOpCustomOptions( mlir::TF::WhileOp op) { - int cond_subgraph_index = subgraph_index_map_.at(op.getCond().str()); - int body_subgraph_index = subgraph_index_map_.at(op.getBody().str()); + int cond_subgraph_index = subgraph_index_map_.at(op.cond().str()); + int body_subgraph_index = subgraph_index_map_.at(op.body().str()); auto flex_builder = absl::make_unique(); flex_builder->Map([&]() { diff --git a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc index 44ff796b7cc..f8831ef08e8 100644 --- a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc +++ b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc @@ -276,8 +276,8 @@ LogicalResult LowerStaticTensorListPass::UpdateWhileFunctionType( auto *context = &getContext(); auto module = getModule(); - FuncOp cond_func = module.lookupSymbol(while_op->getCond()); - FuncOp body_func = module.lookupSymbol(while_op->getBody()); + FuncOp cond_func = module.lookupSymbol(while_op->cond()); + FuncOp body_func = module.lookupSymbol(while_op->body()); if (cond_func) { // Change `cond_func`'s argument types to `unranked_argument_types`. 
diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index f01306fe259..3e62dd786ec 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -282,45 +282,39 @@ static LogicalResult Verify(FusedBatchNormOp op) { // IfOp //===----------------------------------------------------------------------===// -LogicalResult IfOp::verify() { - auto thenAttr = getAttrOfType("then_branch"); - if (!thenAttr) return emitOpError("requires then_branch attribute"); - - auto elseAttr = getAttrOfType("else_branch"); - if (!elseAttr) return emitOpError("requires else_branch attribute"); - - auto module = getParentOfType(); - auto thenFn = module.lookupSymbol(thenAttr.getValue()); +static LogicalResult Verify(IfOp op) { + auto module = op.getParentOfType(); + auto thenFn = module.lookupSymbol(op.then_branch()); if (!thenFn) - return emitOpError("then_branch refers to an undefined function : ") - << thenAttr; - auto elseFn = module.lookupSymbol(elseAttr.getValue()); + return op.emitOpError("then_branch refers to an undefined function : ") + << op.then_branch(); + auto elseFn = module.lookupSymbol(op.else_branch()); if (!elseFn) - return emitOpError("else_branch refers to an undefined function : ") - << elseAttr; + return op.emitOpError("else_branch refers to an undefined function : ") + << op.else_branch(); auto thenFuncType = thenFn.getType(); auto elseFuncType = elseFn.getType(); // Non-conditional operands starting with the second operand are passed to // branches and should be pair-wise compatible with branches' inputs. - unsigned expectedNumInputs = getNumOperands() - 1; + unsigned expectedNumInputs = op.getNumOperands() - 1; if (thenFuncType.getNumInputs() != expectedNumInputs || elseFuncType.getNumInputs() != expectedNumInputs) - return emitError("branches should have " + Twine(expectedNumInputs) + - " inputs"); + return op.emitError("branches should have " + Twine(expectedNumInputs) + + " inputs"); for (unsigned i = 0; i < expectedNumInputs; ++i) { - auto operandType = getOperand(i + 1)->getType().cast(); + auto operandType = op.getOperand(i + 1)->getType().cast(); auto thenInputType = thenFuncType.getInput(i).cast(); if (!AreCastCompatible(operandType, thenInputType)) - return emitError( + return op.emitError( llvm::formatv("then branch input type {0} is incompatible with " "operand type {1} at index {2}", thenInputType, operandType, i)); auto elseInputType = elseFuncType.getInput(i).cast(); if (!AreCastCompatible(operandType, elseInputType)) - return emitError( + return op.emitError( llvm::formatv("else branch input type {0} is incompatible with " "operand type {1} at index {2}", elseInputType, operandType, i)); @@ -328,30 +322,30 @@ LogicalResult IfOp::verify() { // If branches have incompatible input types that means that no tensor can // serve as input to both the functions. Hence, the op is invalid. if (!AreCastCompatible(thenInputType, elseInputType)) - return emitError(llvm::formatv( + return op.emitError(llvm::formatv( "branches inputs have incompatible types {0} and {1} at index {2}", thenInputType, elseInputType, i)); } // Branches' results should be pair-wise compatible with the op results. 
- unsigned expectedNumResults = getNumResults(); + unsigned expectedNumResults = op.getNumResults(); if (thenFuncType.getNumResults() != expectedNumResults || elseFuncType.getNumResults() != expectedNumResults) - return emitError("branches should have " + Twine(expectedNumResults) + - " results"); + return op.emitError("branches should have " + Twine(expectedNumResults) + + " results"); for (unsigned i = 0; i < expectedNumResults; ++i) { - auto resultType = getResult(i)->getType().cast(); + auto resultType = op.getResult(i)->getType().cast(); auto thenResultType = thenFuncType.getResult(i).cast(); if (!AreCastCompatible(thenResultType, resultType)) - return emitError( + return op.emitError( llvm::formatv("then branch result type {0} is incompatible with op " "result type {1} at index {2}", thenResultType, resultType, i)); auto elseResultType = elseFuncType.getResult(i).cast(); if (!AreCastCompatible(elseResultType, resultType)) - return emitError( + return op.emitError( llvm::formatv("else branch result type {0} is incompatible with op " "result type {1} at index {2}", elseResultType, resultType, i)); @@ -734,25 +728,20 @@ void TruncateDivOp::getCanonicalizationPatterns( // WhileOp //===----------------------------------------------------------------------===// -LogicalResult WhileOp::verify() { - auto condAttr = getAttrOfType("cond"); - if (!condAttr) return emitOpError("requires cond attribute"); - - auto module = getParentOfType(); - auto condFn = module.lookupSymbol(condAttr.getValue()); +static LogicalResult Verify(WhileOp op) { + auto module = op.getParentOfType(); + auto condFn = module.lookupSymbol(op.cond()); auto condFuncType = condFn.getType(); // Verify that the cond function has exactly one result. if (condFuncType.getNumResults() != 1) - return emitOpError("requires cond function to have exactly one result"); + return op.emitOpError("requires cond function to have exactly one result"); - auto bodyAttr = getAttrOfType("body"); - if (!bodyAttr) return emitOpError("requires body attribute"); - auto bodyFn = module.lookupSymbol(bodyAttr.getValue()); + auto bodyFn = module.lookupSymbol(op.body()); auto bodyFuncType = bodyFn.getType(); - SmallVector operands(getOperandTypes()); - SmallVector results(getResultTypes()); + SmallVector operands(op.getOperandTypes()); + SmallVector results(op.getResultTypes()); // Collect all the type lists for the op so that different pairs of type lists // can be compared for the compatibility. @@ -796,7 +785,7 @@ LogicalResult WhileOp::verify() { int aSize = a.second.size(); if (aSize != b.second.size()) - return emitOpError( + return op.emitOpError( llvm::formatv("requires the number of {0}s to be equal to the " "number of {1}s. 
Found {2} and {3}, respectively", a.first, b.first, aSize, b.second.size())); @@ -806,7 +795,7 @@ LogicalResult WhileOp::verify() { auto bType = b.second[idx]; if (!AreCastCompatible(aType, bType)) - return emitError(llvm::formatv( + return op.emitError(llvm::formatv( "{0} type {1} is incompatible with {2} type {3} at index {4}", a.first, aType, b.first, bType, idx)); } @@ -840,7 +829,7 @@ TensorFlowDialect::TensorFlowDialect(MLIRContext *context) addOperations< #define GET_OP_LIST #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc.inc" - , IfOp, WhileOp>(); + >(); addTypes< #define HANDLE_TF_TYPE(tftype, enumerant, name) tftype##Type, #define HANDLE_LAST_TF_TYPE(tftype, enumerant, name) tftype##Type @@ -954,27 +943,5 @@ Operation *TensorFlowDialect::materializeConstant(OpBuilder &builder, return nullptr; } -// Verifies that the Op is a well-formed TensorFlow op, checking that all inputs -// and results are Tensor or other TensorFlow types, etc. -LogicalResult verifyTensorFlowOp(Operation *op) { - if (op->getName().getDialect() != "tf") - return op->emitError("TensorFlow op ") - << op->getName() << " should start with 'tf.'"; - - for (Type type : op->getOperandTypes()) { - if (!IsValidTFTensorType(type)) - return op->emitOpError( - "requires operands to have a valid TensorFlow tensor type"); - } - - for (Type type : op->getResultTypes()) { - if (!IsValidTFTensorType(type)) - return op->emitOpError( - "requires results to have a valid TensorFlow tensor type"); - } - - return success(); -} - } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h index 723aa67c6c4..7885a8e6199 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h @@ -64,20 +64,6 @@ class TensorFlowDialect : public Dialect { Location loc) override; }; -// This verifies that the Op is a well-formed TensorFlow op, checking -// that all inputs and results are Tensor or other TensorFlow types, etc. -static LogicalResult verifyTensorFlowOp(Operation *op); - -// This Trait should be used by all TensorFlow Ops. -// -template -class TensorFlowOp : public OpTrait::TraitBase { - public: - static LogicalResult verifyTrait(Operation *op) { - return verifyTensorFlowOp(op); - } -}; - // TODO(b/131258166): TensorFlow's mutex.h defines a `mutex_lock` macro, whose // purpose is to catch bug on `tensorflow::mutex_lock`. We don't use // `tensorflow::mutex_lock` here but we have ops (`tf.MutexLock` and @@ -89,88 +75,6 @@ class TensorFlowOp : public OpTrait::TraitBase { #define GET_OP_CLASSES #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h.inc" -// The "tf.If" operation takes a condition operand, a list of inputs, and a -// function attribute for the then/else branches. The condition operand -// doesn't have to be a boolean tensor. It is handled according to these -// rules, quoting the TensorFlow op definition: -// -// If the tensor is a scalar of non-boolean type, the scalar is converted to -// a boolean according to the following rule: if the scalar is a numerical -// value, non-zero means True and zero means False; if the scalar is a -// string, non-empty means True and empty means False. If the tensor is not a -// scalar, being empty means False and being non-empty means True. 
-// -// This is defined in TensorFlow as: -// -// REGISTER_OP("If") -// .Input("cond: Tcond") -// .Input("input: Tin") -// .Output("output: Tout") -// .Attr("Tcond: type") -// .Attr("Tin: list(type) >= 0") -// .Attr("Tout: list(type) >= 0") -// .Attr("then_branch: func") -// .Attr("else_branch: func") -// -// Note: Additional result corresponds to the control output. -class IfOp : public Op::Impl, - OpTrait::VariadicResults> { - public: - using Op::Op; - static StringRef getOperationName() { return "tf.If"; } - - Value *getCondition() { return getOperand(0); } - - // TODO(b/132271680): This is not following Google naming style - StringRef getThen() { - return getAttrOfType("then_branch").getValue(); - } - - StringRef getElse() { - return getAttrOfType("else_branch").getValue(); - } - - LogicalResult verify(); -}; - -// The "tf.While" operation takes a list of inputs and function attributes for -// the loop condition and body. Inputs are updated repeatedly by the body -// function while the loop condition with the tensors evaluates to true. The -// condition result doesn't have to be a boolean tensor. It is handled -// according to these rules, quoting the TensorFlow op definition: -// -// If the tensor is a scalar of non-boolean type, the scalar is converted to -// a boolean according to the following rule: if the scalar is a numerical -// value, non-zero means True and zero means False; if the scalar is a -// string, non-empty means True and empty means False. If the tensor is not a -// scalar, being empty means False and being non-empty means True. -// -// This is defined in TensorFlow as: -// -// REGISTER_OP("While") -// .Input("input: T") -// .Output("output: T") -// .Attr("T: list(type) >= 0") -// .Attr("cond: func") -// .Attr("body: func") -// .Attr("output_shapes: list(shape) = []") -// -class WhileOp : public Op { - public: - using Op::Op; - static StringRef getOperationName() { return "tf.While"; } - - StringRef getCond() { - return getAttrOfType("cond").getValue(); - } - StringRef getBody() { - return getAttrOfType("body").getValue(); - } - - LogicalResult verify(); -}; - } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td index b2fcb01c2d5..d920f471bbf 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td @@ -78,6 +78,47 @@ Returns a tensor with the same shape and contents as input. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_IfOp : TF_Op<"If", []> { + let summary = "output = cond ? then_branch(input) : else_branch(input)"; + + let description = [{ +output = cond ? then_branch(input) : else_branch(input) + +cond: A Tensor. If the tensor is a scalar of non-boolean type, the + scalar is converted to a boolean according to the + following rule: if the scalar is a numerical value, non-zero means + True and zero means False; if the scalar is a string, non-empty + means True and empty means False. If the tensor is not a scalar, + being empty means False and being non-empty means True. +input: A list of input tensors. +then_branch: A function that takes 'inputs' and returns a list of + tensors, whose types are the same as what else_branch returns. +else_branch: A function that takes 'inputs' and returns a list of + tensors. whose types are the same as what then_branch returns. 
+ }]; + + let arguments = (ins + TF_Tensor:$cond, + Variadic:$input, + + SymbolRefAttr:$then_branch, + SymbolRefAttr:$else_branch, + DefaultValuedAttr:$output_shapes + ); + + let results = (outs + Variadic:$output + ); + + TF_DerivedOperandTypeAttr Tcond = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeListAttr Tin = TF_DerivedOperandTypeListAttr<1>; + TF_DerivedResultTypeListAttr Tout = TF_DerivedResultTypeListAttr<0>; + + let verifier = [{ + return Verify(*this); + }]; +} + def TF_MeanOp : TF_Op<"Mean", [NoSideEffect]> { let summary = "Computes the mean of elements across dimensions of a tensor."; @@ -192,4 +233,46 @@ element_dtype: the desired type of elements in the list. }]; } +def TF_WhileOp : TF_Op<"While", []> { + let summary = [{ +output = input; While (Cond(output)) { output = Body(output) } + }]; + + let description = [{ +output = input; While (Cond(output)) { output = Body(output) } + +input: A list of input tensors whose types are T. +output: A list of output tensors whose types are T. +cond: A function takes 'input' and returns a tensor. If the tensor is + a scalar of non-boolean, the scalar is converted to a boolean + according to the following rule: if the scalar is a numerical + value, non-zero means True and zero means False; if the scalar is + a string, non-empty means True and empty means False. If the + tensor is not a scalar, non-emptiness means True and False + otherwise. +body: A function that takes a list of tensors and returns another + list of tensors. Both lists have the same types as specified + by T. + }]; + + let arguments = (ins + Variadic:$input, + + SymbolRefAttr:$cond, + SymbolRefAttr:$body, + DefaultValuedAttr:$output_shapes, + DefaultValuedAttr:$parallel_iterations + ); + + let results = (outs + Variadic:$output + ); + + TF_DerivedOperandTypeListAttr T = TF_DerivedOperandTypeListAttr<0>; + + let verifier = [{ + return Verify(*this); + }]; +} + #endif // TF_OPS diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir index 53b773f959d..f1c480049e3 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir @@ -500,7 +500,7 @@ func @testIfElse(f32) -> f32 // Test invalid tf.If operation func @testInvalidIfOp(tensor, f32) -> f32 { ^bb0(%arg0: tensor, %arg1: f32): - // expected-error @+1 {{requires operands to have a valid TensorFlow tensor type}} + // expected-error @+1 {{operand #1 must be tensor of tf.dtype values}} %1 = "tf.If"(%arg0, %arg1) { then_branch = @testIfThen, else_branch = @testIfElse @@ -516,7 +516,7 @@ func @testIfElse(tensor<2xf32>) -> tensor<2xf32> // Test invalid tf.If operation func @testInvalidIfOp(tensor, tensor<2xf32>) -> tensor<2xf32> { ^bb0(%arg0: tensor, %arg1: tensor<2xf32>): - // expected-error @+1 {{requires then_branch attribute}} + // expected-error @+1 {{requires attribute 'then_branch'}} %1 = "tf.If"(%arg0, %arg1) { else_branch = @testIfElse } : (tensor, tensor<2xf32>) -> tensor<2xf32> diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc index af3e1e05ade..bc9ed1111df 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc @@ -150,12 +150,12 @@ static LogicalResult LowerIfOp(IfOp op) { OpBuilder builder(op_inst); // Lower the condition to a 
boolean value (i1). - Value* cond_i1 = LowerCondition(loc, op.getCondition(), &builder); + Value* cond_i1 = LowerCondition(loc, op.cond(), &builder); if (!cond_i1) return failure(); auto module = op_inst->getParentOfType(); - auto then_fn = module.lookupSymbol(op.getThen()); - auto else_fn = module.lookupSymbol(op.getElse()); + auto then_fn = module.lookupSymbol(op.then_branch()); + auto else_fn = module.lookupSymbol(op.else_branch()); // Split the basic block before the 'if'. The new dest will be our merge // point. @@ -211,8 +211,8 @@ static LogicalResult LowerWhileOp(WhileOp op) { OpBuilder builder(op_inst); auto module = op_inst->getParentOfType(); - auto cond_fn = module.lookupSymbol(op.getCond()); - auto body_fn = module.lookupSymbol(op.getBody()); + auto cond_fn = module.lookupSymbol(op.cond()); + auto body_fn = module.lookupSymbol(op.body()); // Split the block containing the While op into two blocks. One containing // operations before the While op and other containing the rest. Create two From 50885ca14158a14f520c5c8bd39a3575b9e10fff Mon Sep 17 00:00:00 2001 From: Sachin Joglekar Date: Fri, 19 Jul 2019 21:11:01 -0700 Subject: [PATCH 0226/3053] Adds debug mode to COCO object detection script PiperOrigin-RevId: 259085857 --- ...bject_detection_average_precision_stage.cc | 12 ++-- ...object_detection_average_precision_stage.h | 9 +-- .../stages/object_detection_stage.cc | 6 +- .../stages/object_detection_stage.h | 9 +++ .../tasks/coco_object_detection/README.md | 17 +++++- .../tasks/coco_object_detection/run_eval.cc | 60 ++++++++++++++----- 6 files changed, 83 insertions(+), 30 deletions(-) diff --git a/tensorflow/lite/tools/evaluation/stages/object_detection_average_precision_stage.cc b/tensorflow/lite/tools/evaluation/stages/object_detection_average_precision_stage.cc index a8c301df65a..cfb9a300281 100644 --- a/tensorflow/lite/tools/evaluation/stages/object_detection_average_precision_stage.cc +++ b/tensorflow/lite/tools/evaluation/stages/object_detection_average_precision_stage.cc @@ -57,26 +57,26 @@ TfLiteStatus ObjectDetectionAveragePrecisionStage::Init() { } TfLiteStatus ObjectDetectionAveragePrecisionStage::Run() { - for (int i = 0; i < ground_truth_objects_.objects_size(); ++i) { - const int class_id = ground_truth_objects_.objects(i).class_id(); + for (int i = 0; i < ground_truth_objects_->objects_size(); ++i) { + const int class_id = ground_truth_objects_->objects(i).class_id(); if (class_id >= num_classes_) { LOG(ERROR) << "Encountered invalid class ID: " << class_id; return kTfLiteError; } ground_truth_object_vectors_[class_id].push_back(ConvertProtoToDetection( - ground_truth_objects_.objects(i), current_image_index_)); + ground_truth_objects_->objects(i), current_image_index_)); } - for (int i = 0; i < predicted_objects_.objects_size(); ++i) { - const int class_id = predicted_objects_.objects(i).class_id(); + for (int i = 0; i < predicted_objects_->objects_size(); ++i) { + const int class_id = predicted_objects_->objects(i).class_id(); if (class_id >= num_classes_) { LOG(ERROR) << "Encountered invalid class ID: " << class_id; return kTfLiteError; } predicted_object_vectors_[class_id].push_back(ConvertProtoToDetection( - predicted_objects_.objects(i), current_image_index_)); + predicted_objects_->objects(i), current_image_index_)); } current_image_index_++; diff --git a/tensorflow/lite/tools/evaluation/stages/object_detection_average_precision_stage.h b/tensorflow/lite/tools/evaluation/stages/object_detection_average_precision_stage.h index 16b04827ae5..cf230ce697b 
100644 --- a/tensorflow/lite/tools/evaluation/stages/object_detection_average_precision_stage.h +++ b/tensorflow/lite/tools/evaluation/stages/object_detection_average_precision_stage.h @@ -42,16 +42,17 @@ class ObjectDetectionAveragePrecisionStage : public EvaluationStage { EvaluationStageMetrics LatestMetrics() override; // Call before Run(). + // Both protos must outlive the call to Run(). void SetEvalInputs(const ObjectDetectionResult& predicted_objects, const ObjectDetectionResult& ground_truth_objects) { - predicted_objects_ = predicted_objects; - ground_truth_objects_ = ground_truth_objects; + predicted_objects_ = &predicted_objects; + ground_truth_objects_ = &ground_truth_objects; } private: int num_classes_ = -1; - ObjectDetectionResult predicted_objects_; - ObjectDetectionResult ground_truth_objects_; + const ObjectDetectionResult* predicted_objects_; + const ObjectDetectionResult* ground_truth_objects_; int current_image_index_ = 0; // One inner vector per class for ground truth objects. diff --git a/tensorflow/lite/tools/evaluation/stages/object_detection_stage.cc b/tensorflow/lite/tools/evaluation/stages/object_detection_stage.cc index b4e3401eff0..869d095e726 100644 --- a/tensorflow/lite/tools/evaluation/stages/object_detection_stage.cc +++ b/tensorflow/lite/tools/evaluation/stages/object_detection_stage.cc @@ -109,7 +109,7 @@ TfLiteStatus ObjectDetectionStage::Run() { TF_LITE_ENSURE_STATUS(inference_stage_->Run()); // Convert model output to ObjectsSet. - ObjectDetectionResult predicted_objects; + predicted_objects_.Clear(); const int class_offset = config_.specification().object_detection_params().class_offset(); const std::vector* outputs = inference_stage_->GetOutputs(); @@ -119,7 +119,7 @@ TfLiteStatus ObjectDetectionStage::Run() { float* detected_label_probabilities = static_cast(outputs->at(2)); for (int i = 0; i < num_detections; ++i) { const int bounding_box_offset = i * 4; - auto* object = predicted_objects.add_objects(); + auto* object = predicted_objects_.add_objects(); // Bounding box auto* bbox = object->mutable_bounding_box(); bbox->set_normalized_top(detected_label_boxes[bounding_box_offset + 0]); @@ -134,7 +134,7 @@ TfLiteStatus ObjectDetectionStage::Run() { } // AP Evaluation. - eval_stage_->SetEvalInputs(predicted_objects, *ground_truth_objects_); + eval_stage_->SetEvalInputs(predicted_objects_, *ground_truth_objects_); TF_LITE_ENSURE_STATUS(eval_stage_->Run()); return kTfLiteOk; diff --git a/tensorflow/lite/tools/evaluation/stages/object_detection_stage.h b/tensorflow/lite/tools/evaluation/stages/object_detection_stage.h index ec9772754eb..cc0c935bba9 100644 --- a/tensorflow/lite/tools/evaluation/stages/object_detection_stage.h +++ b/tensorflow/lite/tools/evaluation/stages/object_detection_stage.h @@ -70,13 +70,22 @@ class ObjectDetectionStage : public EvaluationStage { return inference_stage_.get(); } + // Returns a const pointer to the latest inference output. + const ObjectDetectionResult* GetLatestPrediction() { + return &predicted_objects_; + } + private: const std::vector* all_labels_ = nullptr; std::unique_ptr preprocessing_stage_; std::unique_ptr inference_stage_; std::unique_ptr eval_stage_; std::string image_path_; + + // Obtained from SetInputs(...). const ObjectDetectionResult* ground_truth_objects_; + // Reflects the outputs generated from the latest call to Run(). 
+ ObjectDetectionResult predicted_objects_; }; // Reads a tflite::evaluation::ObjectDetectionGroundTruth instance from a diff --git a/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/README.md b/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/README.md index db4e00d8f81..aa7905a2996 100644 --- a/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/README.md +++ b/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/README.md @@ -110,7 +110,7 @@ TFLite * `ground_truth_proto`: `string` \ Path to file containing tflite::evaluation::ObjectDetectionGroundTruth proto - in text format. + in text format. If left empty, mAP numbers are not provided. The above two parameters can be prepared using the `preprocess_coco_minival` script included in this folder. @@ -129,6 +129,21 @@ The following optional parameters can be used to modify the inference runtime: If provided, tries to use the specified delegate for accuracy evaluation. Valid values: "nnapi", "gpu". +### Debug Mode + +The script also supports a debug mode with the following parameter: + +* `debug_mode`: `boolean` \ + Whether to enable debug mode. Per-image predictions are written to the + output file along with metrics. NOTE: It's not possible to parse the output + file as a proto in this mode, since it contains demarcations between + per-file outputs for readability. + +This mode lets you debug the output of an object detection model that isn't +necessarily trained on the COCO dataset (by leaving `ground_truth_proto` empty). +The model output signature would still need to follow the convention mentioned +above, and you would still need an output labels file. + ## Preprocessing the minival dataset To compute mAP in a consistent and interpretable way, we utilize the same 2014 diff --git a/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc b/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc index 3479ee48311..470fb8e7f00 100644 --- a/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc +++ b/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc @@ -34,6 +34,7 @@ constexpr char kModelOutputLabelsFlag[] = "model_output_labels"; constexpr char kOutputFilePathFlag[] = "output_file_path"; constexpr char kGroundTruthProtoFileFlag[] = "ground_truth_proto"; constexpr char kInterpreterThreadsFlag[] = "num_interpreter_threads"; +constexpr char kDebugModeFlag[] = "debug_mode"; constexpr char kDelegateFlag[] = "delegate"; constexpr char kNnapiDelegate[] = "nnapi"; constexpr char kGpuDelegate[] = "gpu"; @@ -49,7 +50,7 @@ bool EvaluateModel(const std::string& model_file_path, const std::vector& image_paths, const std::string& ground_truth_proto_file, std::string delegate, std::string output_file_path, - int num_interpreter_threads) { + int num_interpreter_threads, bool debug_mode) { EvaluationStageConfig eval_config; eval_config.set_name("object_detection"); auto* detection_params = @@ -65,27 +66,47 @@ bool EvaluateModel(const std::string& model_file_path, // Get ground truth data. absl::flat_hash_map ground_truth_map; - PopulateGroundTruth(ground_truth_proto_file, &ground_truth_map); + if (!ground_truth_proto_file.empty()) { + PopulateGroundTruth(ground_truth_proto_file, &ground_truth_map); + } ObjectDetectionStage eval(eval_config); eval.SetAllLabels(model_labels); if (eval.Init() != kTfLiteOk) return false; + // Open output file for writing.
+ std::ofstream ofile; + ofile.open(output_file_path, std::ios::out); + const int step = image_paths.size() / 100; for (int i = 0; i < image_paths.size(); ++i) { if (step > 1 && i % step == 0) { LOG(INFO) << "Finished: " << i / step << "%"; } - eval.SetInputs(image_paths[i], - ground_truth_map[GetNameFromPath(image_paths[i])]); + + const std::string image_name = GetNameFromPath(image_paths[i]); + eval.SetInputs(image_paths[i], ground_truth_map[image_name]); if (eval.Run() != kTfLiteOk) return false; + + if (debug_mode) { + ObjectDetectionResult prediction = *eval.GetLatestPrediction(); + prediction.set_image_name(image_name); + ofile << prediction.DebugString(); + ofile << "======================================================\n"; + } } - std::ofstream metrics_ofile; - metrics_ofile.open(output_file_path, std::ios::out); - metrics_ofile << eval.LatestMetrics().DebugString(); - metrics_ofile.close(); + // Write metrics to file. + EvaluationStageMetrics metrics = eval.LatestMetrics(); + if (ground_truth_proto_file.empty()) { + // mAP metrics are meaningless for no ground truth. + metrics.mutable_process_metrics() + ->mutable_object_detection_metrics() + ->clear_average_precision_metrics(); + } + ofile << metrics.DebugString(); + ofile.close(); return true; } @@ -99,6 +120,7 @@ int Main(int argc, char* argv[]) { std::string output_file_path; std::string delegate; int num_interpreter_threads = 1; + bool debug_mode; std::vector flag_list = { tflite::Flag::CreateFlag(kModelFileFlag, &model_file_path, "Path to test tflite model file."), @@ -112,13 +134,19 @@ int Main(int argc, char* argv[]) { kGroundTruthImagesPathFlag, &ground_truth_images_path, "Path to ground truth images. These will be evaluated in " "alphabetical order of filenames"), - tflite::Flag::CreateFlag(kGroundTruthProtoFileFlag, - &ground_truth_proto_file, - "Path to file containing " - "tflite::evaluation::ObjectDetectionGroundTruth " - "proto in text format"), - tflite::Flag::CreateFlag(kOutputFilePathFlag, &output_file_path, - "File to output metrics proto to."), + tflite::Flag::CreateFlag( + kGroundTruthProtoFileFlag, &ground_truth_proto_file, + "Path to file containing " + "tflite::evaluation::ObjectDetectionGroundTruth " + "proto in text format. If left empty, mAP numbers are not output."), + tflite::Flag::CreateFlag( + kOutputFilePathFlag, &output_file_path, + "File to output to. Contains only metrics proto if debug_mode is " + "off, and per-image predictions also otherwise."), + tflite::Flag::CreateFlag(kDebugModeFlag, &debug_mode, + "Whether to enable debug mode. Per-image " + "predictions are written to the output file " + "along with metrics."), tflite::Flag::CreateFlag( kInterpreterThreadsFlag, &num_interpreter_threads, "Number of interpreter threads to use for inference."), @@ -141,7 +169,7 @@ int Main(int argc, char* argv[]) { if (!EvaluateModel(model_file_path, model_labels, image_paths, ground_truth_proto_file, delegate, output_file_path, - num_interpreter_threads)) { + num_interpreter_threads, debug_mode)) { LOG(ERROR) << "Could not evaluate model"; return 0; } From ed87a6ddc3b93b59fc0d5b3b358004be8ea1036e Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Fri, 19 Jul 2019 22:05:05 -0700 Subject: [PATCH 0227/3053] Allow users of Node[Def]Builder to avoid copying the created NodeDef on finalization. By passing true as the optional `consume` argument, we can move the constructed NodeDef out of the NodeDefBuilder, which avoids a potentially large copy. 
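A minimal sketch of how a caller could opt in to the move (illustrative only, not part of this patch; the wrapper function and node name are made up). After Finalize(out, /*consume=*/true) the builder's internal NodeDef has been moved from, so the builder must not be finalized again:

#include "tensorflow/core/framework/node_def.pb.h"
#include "tensorflow/core/framework/node_def_builder.h"

// Builds a trivial NoOp node, moving the result into *out instead of copying it.
tensorflow::Status BuildNoOpNode(tensorflow::NodeDef* out) {
  tensorflow::NodeDefBuilder builder("my_noop", "NoOp");
  // consume=true moves the internally constructed NodeDef into *out; the
  // builder is left in an undefined state and must not be reused afterwards.
  return builder.Finalize(out, /*consume=*/true);
}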
PiperOrigin-RevId: 259089263 --- tensorflow/core/framework/node_def_builder.cc | 8 +- tensorflow/core/framework/node_def_builder.h | 4 +- .../core/framework/node_def_builder_test.cc | 10 ++- .../core/framework/node_def_util_test.cc | 75 ++++++++++--------- tensorflow/core/graph/graph_partition.cc | 18 ++--- tensorflow/core/graph/node_builder.cc | 4 +- tensorflow/core/graph/node_builder.h | 4 +- tensorflow/core/graph/subgraph.cc | 8 +- 8 files changed, 74 insertions(+), 57 deletions(-) diff --git a/tensorflow/core/framework/node_def_builder.cc b/tensorflow/core/framework/node_def_builder.cc index 6a25114e6dc..58f79bd3657 100644 --- a/tensorflow/core/framework/node_def_builder.cc +++ b/tensorflow/core/framework/node_def_builder.cc @@ -211,7 +211,7 @@ NodeDefBuilder& NodeDefBuilder::Device(StringPiece device_spec) { return *this; } -Status NodeDefBuilder::Finalize(NodeDef* node_def) const { +Status NodeDefBuilder::Finalize(NodeDef* node_def, bool consume) { const std::vector* errors_ptr = &errors_; std::vector errors_storage; if (op_def_ != nullptr && inputs_specified_ < op_def_->input_arg_size()) { @@ -243,7 +243,11 @@ Status NodeDefBuilder::Finalize(NodeDef* node_def) const { } else { NodeDef node_def_backup; if (node_def == nullptr) node_def = &node_def_backup; - *node_def = node_def_; + if (consume) { + *node_def = std::move(node_def_); + } else { + *node_def = node_def_; + } // Add control inputs after the regular inputs. for (const auto& control_input : control_inputs_) { diff --git a/tensorflow/core/framework/node_def_builder.h b/tensorflow/core/framework/node_def_builder.h index 63d856d16c6..92d6399d1e2 100644 --- a/tensorflow/core/framework/node_def_builder.h +++ b/tensorflow/core/framework/node_def_builder.h @@ -129,9 +129,11 @@ class NodeDefBuilder { // Finish building the NodeDef, returning any errors or setting // *node_def if none. + // If `consume` is true, the builder state will be moved into `node_def`, + // and the builder will be left in an undefined state. // WARNING: Not all problems are detected! The resulting NodeDef may // not be valid! Call ValidateNodeDef() from node_def_utils to be sure. - Status Finalize(NodeDef* node_def) const; + Status Finalize(NodeDef* node_def, bool consume = false); // Accessors for the values set in the constructor. const string& node_name() const { return node_def_.name(); } diff --git a/tensorflow/core/framework/node_def_builder_test.cc b/tensorflow/core/framework/node_def_builder_test.cc index 7c4426e276a..d93f8e9e2d8 100644 --- a/tensorflow/core/framework/node_def_builder_test.cc +++ b/tensorflow/core/framework/node_def_builder_test.cc @@ -48,7 +48,7 @@ class NodeDefBuilderTest : public ::testing::Test { // Calls Finalize() and verifies it returns success and the result matches // expectations. - void ExpectSuccess(const NodeDefBuilder& builder, + void ExpectSuccess(NodeDefBuilder& builder, // NOLINT DataTypeSlice expected_in_types, DataTypeSlice expected_out_types, StringPiece proto) { NodeDef node_def; @@ -76,7 +76,7 @@ class NodeDefBuilderTest : public ::testing::Test { // Calls Finalize() and verifies it returns an error. // Each message must appear as a substring of the error. - void ExpectFailures(const NodeDefBuilder& builder, + void ExpectFailures(NodeDefBuilder& builder, // NOLINT const std::vector& messages) { NodeDef node_def; Status status = builder.Finalize(&node_def); @@ -90,13 +90,15 @@ class NodeDefBuilderTest : public ::testing::Test { // Calls Finalize() and verifies it returns an error. 
// Message must appear as a substring of the error. - void ExpectFailure(const NodeDefBuilder& builder, const string& message) { + void ExpectFailure(NodeDefBuilder& builder, // NOLINT + const string& message) { ExpectFailures(builder, {message}); } // Like ExpectFailure(), except that the error can come from // ValidateNodeDef(). - void ExpectInvalid(const NodeDefBuilder& builder, const string& message) { + void ExpectInvalid(NodeDefBuilder& builder, // NOLINT + const string& message) { NodeDef node_def; Status status = builder.Finalize(&node_def); if (status.ok()) { diff --git a/tensorflow/core/framework/node_def_util_test.cc b/tensorflow/core/framework/node_def_util_test.cc index 4c4f0e2f37a..0817eb3a4e9 100644 --- a/tensorflow/core/framework/node_def_util_test.cc +++ b/tensorflow/core/framework/node_def_util_test.cc @@ -43,7 +43,7 @@ NodeDef ToNodeDef(const string& text) { return node_def; } -NodeDef ToNodeDef(const NodeDefBuilder& builder) { +NodeDef ToNodeDef(NodeDefBuilder&& builder) { NodeDef node_def; TF_EXPECT_OK(builder.Finalize(&node_def)); return node_def; @@ -244,14 +244,14 @@ TEST(NodeDefUtilTest, AnyIn) { TEST(NodeDefUtilTest, Device) { const OpDef op_def1 = ToOpDef(OpDefBuilder("None")); const NodeDef node_def1 = - ToNodeDef(NodeDefBuilder("d", &op_def1).Device("/cpu:17")); + ToNodeDef(std::move(NodeDefBuilder("d", &op_def1).Device("/cpu:17"))); ExpectSuccess(node_def1, op_def1); EXPECT_EQ("{{node d}} = None[_device=\"/cpu:17\"]()", SummarizeNodeDef(node_def1)); const OpDef op_def2 = ToOpDef(OpDefBuilder("WithAttr").Attr("v: int")); - const NodeDef node_def2 = - ToNodeDef(NodeDefBuilder("d", &op_def2).Attr("v", 7).Device("/cpu:5")); + const NodeDef node_def2 = ToNodeDef( + std::move(NodeDefBuilder("d", &op_def2).Attr("v", 7).Device("/cpu:5"))); ExpectSuccess(node_def2, op_def2); EXPECT_EQ("{{node d}} = WithAttr[v=7, _device=\"/cpu:5\"]()", SummarizeNodeDef(node_def2)); @@ -376,8 +376,8 @@ TEST(InputTypesForNode, Simple) { .Input("b: int32") .Output("c: string") .Output("d: bool")); - const NodeDef node_def = ToNodeDef( - NodeDefBuilder("simple", &op_def).Input(FakeInput()).Input(FakeInput())); + const NodeDef node_def = ToNodeDef(std::move( + NodeDefBuilder("simple", &op_def).Input(FakeInput()).Input(FakeInput()))); DataTypeVector types; EXPECT_TRUE(InputTypesForNode(node_def, op_def, &types).ok()); EXPECT_EQ(types[0], DT_FLOAT); @@ -397,8 +397,8 @@ TEST(OutputTypesForNode, Simple) { .Input("b: int32") .Output("c: string") .Output("d: bool")); - const NodeDef node_def = ToNodeDef( - NodeDefBuilder("simple", &op_def).Input(FakeInput()).Input(FakeInput())); + const NodeDef node_def = ToNodeDef(std::move( + NodeDefBuilder("simple", &op_def).Input(FakeInput()).Input(FakeInput()))); DataTypeVector types; EXPECT_TRUE(OutputTypesForNode(node_def, op_def, &types).ok()); EXPECT_EQ(types[0], DT_STRING); @@ -418,8 +418,10 @@ TEST(OutputTypesForNode_AttrSliceOverload, Simple) { .Input("b: int32") .Output("c: string") .Output("d: bool")); - const AttrSlice attr_slice = AttrSlice(ToNodeDef( - NodeDefBuilder("simple", &op_def).Input(FakeInput()).Input(FakeInput()))); + const AttrSlice attr_slice = + AttrSlice(ToNodeDef(std::move(NodeDefBuilder("simple", &op_def) + .Input(FakeInput()) + .Input(FakeInput())))); DataTypeVector types; EXPECT_TRUE(OutputTypesForNode(attr_slice, op_def, &types).ok()); EXPECT_EQ(types[0], DT_STRING); @@ -433,8 +435,8 @@ TEST(NameRangesForNodeTest, Simple) { .Output("c: string") .Output("d: bool")); NameRangeMap inputs, outputs; - const NodeDef node_def = 
ToNodeDef( - NodeDefBuilder("simple", &op_def).Input(FakeInput()).Input(FakeInput())); + const NodeDef node_def = ToNodeDef(std::move( + NodeDefBuilder("simple", &op_def).Input(FakeInput()).Input(FakeInput()))); TF_EXPECT_OK(NameRangesForNode(node_def, op_def, &inputs, &outputs)); EXPECT_EQ(NameRangeMap({{"a", {0, 1}}, {"b", {1, 2}}}), inputs); EXPECT_EQ(NameRangeMap({{"c", {0, 1}}, {"d", {1, 2}}}), outputs); @@ -453,18 +455,20 @@ TEST(NameRangesForNodeTest, Polymorphic) { .Output("c: T") .Attr("T: type")); NameRangeMap inputs, outputs; - const NodeDef node_def1 = ToNodeDef(NodeDefBuilder("poly", &op_def) - .Input(FakeInput(DT_INT32)) - .Input(FakeInput(DT_INT32))); + const NodeDef node_def1 = + ToNodeDef(std::move(NodeDefBuilder("poly", &op_def) + .Input(FakeInput(DT_INT32)) + .Input(FakeInput(DT_INT32)))); TF_EXPECT_OK(NameRangesForNode(node_def1, op_def, &inputs, &outputs)); EXPECT_EQ(NameRangeMap({{"a", {0, 1}}, {"b", {1, 2}}}), inputs); EXPECT_EQ(NameRangeMap({{"c", {0, 1}}}), outputs); EXPECT_EQ("{{node poly}} = Polymorphic[T=DT_INT32](a, b)", SummarizeNodeDef(node_def1)); - const NodeDef node_def2 = ToNodeDef(NodeDefBuilder("poly", &op_def) - .Input(FakeInput(DT_BOOL)) - .Input(FakeInput(DT_BOOL))); + const NodeDef node_def2 = + ToNodeDef(std::move(NodeDefBuilder("poly", &op_def) + .Input(FakeInput(DT_BOOL)) + .Input(FakeInput(DT_BOOL)))); TF_EXPECT_OK(NameRangesForNode(node_def2, op_def, &inputs, &outputs)); EXPECT_EQ(NameRangeMap({{"a", {0, 1}}, {"b", {1, 2}}}), inputs); EXPECT_EQ(NameRangeMap({{"c", {0, 1}}}), outputs); @@ -483,10 +487,11 @@ TEST(NameRangesForNodeTest, NRepeats) { .Attr("M: int") .Attr("T: type")); NameRangeMap inputs, outputs; - const NodeDef node_def1 = ToNodeDef(NodeDefBuilder("nr", &op_def) - .Input(FakeInput(4, DT_INT32)) - .Input(FakeInput(4, DT_FLOAT)) - .Attr("M", 3)); + const NodeDef node_def1 = + ToNodeDef(std::move(NodeDefBuilder("nr", &op_def) + .Input(FakeInput(4, DT_INT32)) + .Input(FakeInput(4, DT_FLOAT)) + .Attr("M", 3))); TF_EXPECT_OK(NameRangesForNode(node_def1, op_def, &inputs, &outputs)); EXPECT_EQ(NameRangeMap({{"a", {0, 4}}, {"b", {4, 8}}}), inputs); EXPECT_EQ(NameRangeMap({{"c", {0, 1}}, {"d", {1, 5}}, {"e", {5, 8}}}), @@ -496,10 +501,11 @@ TEST(NameRangesForNodeTest, NRepeats) { "b:2, b:3)", SummarizeNodeDef(node_def1)); - const NodeDef node_def2 = ToNodeDef(NodeDefBuilder("nr", &op_def) - .Input(FakeInput(2, DT_INT32)) - .Input(FakeInput(2, DT_DOUBLE)) - .Attr("M", 7)); + const NodeDef node_def2 = + ToNodeDef(std::move(NodeDefBuilder("nr", &op_def) + .Input(FakeInput(2, DT_INT32)) + .Input(FakeInput(2, DT_DOUBLE)) + .Attr("M", 7))); TF_EXPECT_OK(NameRangesForNode(node_def2, op_def, &inputs, &outputs)); EXPECT_EQ(NameRangeMap({{"a", {0, 2}}, {"b", {2, 4}}}), inputs); EXPECT_EQ(NameRangeMap({{"c", {0, 1}}, {"d", {1, 3}}, {"e", {3, 10}}}), @@ -524,10 +530,10 @@ TEST(NameRangesForNodeTest, TypeList) { .Attr("T3: list(type)")); NameRangeMap inputs, outputs; const NodeDef node_def1 = - ToNodeDef(NodeDefBuilder("tl", &op_def) - .Input(FakeInput({DT_BOOL, DT_FLOAT})) - .Input(FakeInput(4, DT_FLOAT)) - .Attr("T3", {DT_INT32, DT_DOUBLE, DT_STRING})); + ToNodeDef(std::move(NodeDefBuilder("tl", &op_def) + .Input(FakeInput({DT_BOOL, DT_FLOAT})) + .Input(FakeInput(4, DT_FLOAT)) + .Attr("T3", {DT_INT32, DT_DOUBLE, DT_STRING}))); TF_EXPECT_OK(NameRangesForNode(node_def1, op_def, &inputs, &outputs)); EXPECT_EQ(NameRangeMap({{"a", {0, 2}}, {"b", {2, 6}}}), inputs); EXPECT_EQ(NameRangeMap({{"c", {0, 4}}, {"d", {4, 7}}, {"e", {7, 9}}}), @@ -538,10 +544,11 @@ 
TEST(NameRangesForNodeTest, TypeList) { " T3=[DT_INT32, DT_DOUBLE, DT_STRING]](a, a:1, b, b:1, b:2, b:3)", SummarizeNodeDef(node_def1)); - const NodeDef node_def2 = ToNodeDef(NodeDefBuilder("tl", &op_def) - .Input(FakeInput(7, DT_INT32)) - .Input(FakeInput({DT_DOUBLE})) - .Attr("T3", {DT_DOUBLE, DT_STRING})); + const NodeDef node_def2 = + ToNodeDef(std::move(NodeDefBuilder("tl", &op_def) + .Input(FakeInput(7, DT_INT32)) + .Input(FakeInput({DT_DOUBLE})) + .Attr("T3", {DT_DOUBLE, DT_STRING}))); TF_EXPECT_OK(NameRangesForNode(node_def2, op_def, &inputs, &outputs)); EXPECT_EQ(NameRangeMap({{"a", {0, 7}}, {"b", {7, 8}}}), inputs); EXPECT_EQ(NameRangeMap({{"c", {0, 1}}, {"d", {1, 3}}, {"e", {3, 10}}}), diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc index a13769b3315..1c906a3599c 100644 --- a/tensorflow/core/graph/graph_partition.cc +++ b/tensorflow/core/graph/graph_partition.cc @@ -227,7 +227,7 @@ NodeDef* AddSend(const PartitionOptions& opts, const GraphInfo& g_info, } NodeDef* cast = gdef->add_node(); - *status = cast_builder.Finalize(cast); + *status = cast_builder.Finalize(cast, /*consume=*/true); if (!status->ok()) return nullptr; // Connect the Send op to the cast. @@ -244,7 +244,7 @@ NodeDef* AddSend(const PartitionOptions& opts, const GraphInfo& g_info, send_builder.Attr("_start_time", start_time); } NodeDef* send = gdef->add_node(); - *status = send_builder.Finalize(send); + *status = send_builder.Finalize(send, /*consume=*/true); return send; } @@ -301,7 +301,7 @@ NodeDef* AddRecv(const PartitionOptions& opts, const GraphInfo& g_info, recv_builder.Device(dst->assigned_device_name()) .Attr("tensor_type", cast_dtype); NodeDef* recv = gdef->add_node(); - *status = recv_builder.Finalize(recv); + *status = recv_builder.Finalize(recv, /*consume=*/true); if (!status->ok()) return nullptr; *real_recv = recv; @@ -314,7 +314,7 @@ NodeDef* AddRecv(const PartitionOptions& opts, const GraphInfo& g_info, cast_builder.Device(dst->assigned_device_name()) .Input(recv->name(), 0, cast_dtype); NodeDef* cast = gdef->add_node(); - *status = cast_builder.Finalize(cast); + *status = cast_builder.Finalize(cast, /*consume=*/true); if (!status->ok()) return nullptr; return cast; } else if (edge->IsControlEdge()) { @@ -324,7 +324,7 @@ NodeDef* AddRecv(const PartitionOptions& opts, const GraphInfo& g_info, id_builder.Device(dst->assigned_device_name()) .Input(recv->name(), 0, cast_dtype); NodeDef* id = gdef->add_node(); - *status = id_builder.Finalize(id); + *status = id_builder.Finalize(id, /*consume=*/true); if (!status->ok()) return nullptr; return id; } else { @@ -341,7 +341,7 @@ NodeDef* AddDummyConst(const PartitionOptions& opts, GraphDef* gdef, .Device(src->assigned_device_name()) .Attr("dtype", DT_FLOAT) .Attr("value", tensor) - .Finalize(result); + .Finalize(result, /*consume=*/true); return result; } @@ -354,7 +354,7 @@ NodeDef* AddControlTrigger(const PartitionOptions& opts, GraphDef* gdef, "ControlTrigger") .Device(assigned_device_name) .Attr("_start_time", starttime) - .Finalize(result); + .Finalize(result, /*consume=*/true); return result; } @@ -424,7 +424,7 @@ Node* AddControlEnter(Graph* g, const string& node_name, node_builder.Attr("frame_name", frame_name); node_builder.Attr("parallel_iterations", parallel_iterations); Node* res_node; - *status = node_builder.Finalize(g, &res_node); + *status = node_builder.Finalize(g, &res_node, /*consume=*/true); if (!status->ok()) return nullptr; res_node->set_assigned_device_name(device_name); return 
res_node; @@ -437,7 +437,7 @@ Node* AddControlMerge(const string& in_name1, const string& in_name2, Graph* g, NodeBuilder node_builder(node_name, "Merge", g->op_registry()); node_builder.Input({{in_name1, 0, DT_FLOAT}, {in_name2, 0, DT_FLOAT}}); Node* res_node; - *status = node_builder.Finalize(g, &res_node); + *status = node_builder.Finalize(g, &res_node, /*consume=*/true); if (!status->ok()) return nullptr; res_node->set_assigned_device_name(device_name); return res_node; diff --git a/tensorflow/core/graph/node_builder.cc b/tensorflow/core/graph/node_builder.cc index 6ce4531c5bc..07bf49f7f63 100644 --- a/tensorflow/core/graph/node_builder.cc +++ b/tensorflow/core/graph/node_builder.cc @@ -112,7 +112,7 @@ NodeBuilder& NodeBuilder::XlaCluster(StringPiece xla_cluster) { return *this; } -Status NodeBuilder::Finalize(Graph* graph, Node** created_node) const { +Status NodeBuilder::Finalize(Graph* graph, Node** created_node, bool consume) { // In case of error, set *created_node to nullptr. if (created_node != nullptr) *created_node = nullptr; if (!errors_.empty()) { @@ -120,7 +120,7 @@ Status NodeBuilder::Finalize(Graph* graph, Node** created_node) const { } NodeDef node_def; - TF_RETURN_IF_ERROR(def_builder_.Finalize(&node_def)); + TF_RETURN_IF_ERROR(def_builder_.Finalize(&node_def, consume)); TF_RETURN_IF_ERROR(ValidateNodeDef(node_def, def_builder_.op_def())); TF_RETURN_IF_ERROR( CheckOpDeprecation(def_builder_.op_def(), graph->versions().producer())); diff --git a/tensorflow/core/graph/node_builder.h b/tensorflow/core/graph/node_builder.h index 51e044cd8b2..ce4fb4f3c48 100644 --- a/tensorflow/core/graph/node_builder.h +++ b/tensorflow/core/graph/node_builder.h @@ -121,7 +121,9 @@ class NodeBuilder { // Validates the described node and adds it to *graph, adding edges // for all (non-back) inputs. If created_node is not nullptr, // *created_node will be set to the new node (or nullptr on error). - Status Finalize(Graph* graph, Node** created_node) const; + // If `consume` is true, the builder state will be moved into `node_def`, + // and the builder will be left in an undefined state. + Status Finalize(Graph* graph, Node** created_node, bool consume = false); // Accessors for the values set in the constructor. 
const string& node_name() const { return def_builder_.node_name(); } diff --git a/tensorflow/core/graph/subgraph.cc b/tensorflow/core/graph/subgraph.cc index 7d839723f89..e70427f9ef8 100644 --- a/tensorflow/core/graph/subgraph.cc +++ b/tensorflow/core/graph/subgraph.cc @@ -229,7 +229,7 @@ Status ArgFeedRewrite::AddNode(Graph* g, NodeBuilder::NodeOut feed_tensor, "_Arg") .Attr("T", BaseType(feed_tensor.node->output_type(feed_tensor.index))) .Attr("index", arg_index_) - .Finalize(g, out_node)); + .Finalize(g, out_node, /*consume=*/true)); (*out_node)->set_assigned_device_name(device_info().name()); return Status::OK(); } @@ -248,7 +248,7 @@ Status RecvFeedRewrite::AddNode(Graph* g, NodeBuilder::NodeOut feed_tensor, .Attr("send_device_incarnation", static_cast(device_info().incarnation())) .Attr("client_terminated", true) - .Finalize(g, out_node)); + .Finalize(g, out_node, /*consume=*/true)); (*out_node)->set_assigned_device_name(device_info().name()); return Status::OK(); @@ -268,7 +268,7 @@ Status RetvalFetchRewrite::AddNode(Graph* g, NodeBuilder::NodeOut fetch_tensor, .Attr("T", BaseType(fetch_tensor.node->output_type(fetch_tensor.index))) .Attr("index", retval_index_) - .Finalize(g, out_node)); + .Finalize(g, out_node, /*consume=*/true)); (*out_node)->set_assigned_device_name(device_info().name()); return Status::OK(); } @@ -286,7 +286,7 @@ Status SendFetchRewrite::AddNode(Graph* g, NodeBuilder::NodeOut fetch_tensor, .Attr("send_device_incarnation", static_cast(device_info().incarnation())) .Attr("client_terminated", true) - .Finalize(g, out_node)); + .Finalize(g, out_node, /*consume=*/true)); (*out_node)->set_assigned_device_name(device_info().name()); return Status::OK(); } From 144bfee21ad830bcbdd1bc8f138684cca0e3234f Mon Sep 17 00:00:00 2001 From: Yuefeng Zhou Date: Fri, 19 Jul 2019 22:44:02 -0700 Subject: [PATCH 0228/3053] Use regroup to wrap Mirrored values in cross_device_ops. PiperOrigin-RevId: 259091404 --- .../python/distribute/cross_device_ops.py | 64 +++++++---- .../distribute/cross_device_ops_test.py | 101 +++++++++++------- .../python/distribute/cross_device_utils.py | 2 +- .../distribute/mirrored_variable_test.py | 5 + .../python/distribute/moving_averages_test.py | 1 + 5 files changed, 111 insertions(+), 62 deletions(-) diff --git a/tensorflow/python/distribute/cross_device_ops.py b/tensorflow/python/distribute/cross_device_ops.py index 143b183e76b..1932a5a29ee 100644 --- a/tensorflow/python/distribute/cross_device_ops.py +++ b/tensorflow/python/distribute/cross_device_ops.py @@ -48,21 +48,26 @@ def check_destinations(destinations): Boolean which is True if `destinations` is not empty. """ # Calling bool() on a ResourceVariable is not allowed. - if isinstance(destinations, resource_variable_ops.BaseResourceVariable): + if isinstance(destinations, + (resource_variable_ops.BaseResourceVariable, ops.Tensor)): return bool(destinations.device) return bool(destinations) def validate_destinations(destinations): - if not isinstance(destinations, - (value_lib.DistributedValues, - resource_variable_ops.BaseResourceVariable, - value_lib.AggregatingVariable, - six.string_types, - value_lib.TPUMirroredVariable, - # LogicalDeviceSpec is only used internally, e.g. as a - # broadcast destination, never supplied by a user. 
- value_lib.LogicalDeviceSpec)): + """Validates the `destination` is one of expected types.""" + if not isinstance( + destinations, + ( + value_lib.DistributedValues, + resource_variable_ops.BaseResourceVariable, + ops.Tensor, + value_lib.AggregatingVariable, + six.string_types, + value_lib.TPUMirroredVariable, + # LogicalDeviceSpec is only used internally, e.g. as a + # broadcast destination, never supplied by a user. + value_lib.LogicalDeviceSpec)): raise ValueError("destinations must be one of a `DistributedValues` object," " a tf.Variable object, or a device string.") @@ -159,7 +164,7 @@ def get_devices_from(destinations): destinations.logical_device) elif isinstance(destinations, six.string_types): return (device_util.resolve(destinations),) - return (destinations.device,) + return (device_util.resolve(destinations.device),) def get_device_map_from(destinations): @@ -199,7 +204,8 @@ def simple_broadcast(value, destinations, always_mirrored=False): value_updates.append( cross_device_utils.copy_tensor_or_indexed_slices_to_device( value, d)) - return value_lib.Mirrored(device_map, value_updates, logical_device) + return value_lib.regroup( + device_map, value_updates, wrap_class=value_lib.Mirrored) def _simple_reduce(per_replica_value, reduce_to_device, accumulation_fn, @@ -265,8 +271,10 @@ class CrossDeviceOps(object): if self._num_between_graph_workers == 1 and len( per_replica_value.values) == 1 and _devices_match( per_replica_value, destinations): - return value_lib.Mirrored(per_replica_value.device_map, - per_replica_value.values) + return value_lib.regroup( + per_replica_value.device_map, + per_replica_value.values, + wrap_class=value_lib.Mirrored) return self.reduce_implementation(reduce_op, per_replica_value, destinations) @@ -306,7 +314,8 @@ class CrossDeviceOps(object): value_destination_pairs) and len( value_destination_pairs[0][0].values) == 1: return [ - value_lib.Mirrored(v.device_map, v.values) + value_lib.regroup( + v.device_map, v.values, wrap_class=value_lib.Mirrored) for v, _ in value_destination_pairs ] @@ -475,16 +484,20 @@ def _ungroup_and_make_mirrored(grouped_reduced, Returns: a list of Mirrored objects. 
""" - device_map, logical_device = get_device_map_from(destinations) + device_map, _ = get_device_map_from(destinations) num_replicas = device_map.num_replicas_in_graph * num_between_graph_workers index = [[] for _ in range(len(grouped_reduced[0]))] for per_replica_reduced in grouped_reduced: for i, (v, _) in enumerate(per_replica_reduced): if reduce_op == reduce_util.ReduceOp.MEAN: - index[i].append(v / num_replicas) + with ops.device(v.device): + index[i].append(v / num_replicas) else: index[i].append(v) - return [value_lib.Mirrored(device_map, v, logical_device) for v in index] + return [ + value_lib.regroup(device_map, v, wrap_class=value_lib.Mirrored) + for v in index + ] class _ConcatAndSplitPacker(object): @@ -1009,10 +1022,19 @@ class CollectiveAllReduce(CrossDeviceOps): def reduce_implementation(self, reduce_op, per_replica_value, destinations): all_reduced = self._batch_all_reduce(reduce_op, [per_replica_value])[0] device_map, logical_device = get_device_map_from(destinations) - if (all_reduced.device_map is device_map and + devices = device_map.logical_to_actual_devices(logical_device) + + if (isinstance(all_reduced, value_lib.Mirrored) and + all_reduced.device_map is device_map and all_reduced.logical_device == logical_device): return all_reduced - devices = device_map.logical_to_actual_devices(logical_device) + + # Convert `all_reduced` to a `Mirrored` object, as a simple and uniform + # utility to access component for a particular device. + if not isinstance(all_reduced, value_lib.Mirrored): + all_reduced = value_lib.Mirrored( + value_lib.SingleDeviceMap(all_reduced.device), [all_reduced]) + index = [] with ops.control_dependencies(all_reduced.values): for d in devices: @@ -1024,7 +1046,7 @@ class CollectiveAllReduce(CrossDeviceOps): # copy from the corresponding replica instead of the primary. 
index.append(array_ops.identity(all_reduced.primary)) - return value_lib.Mirrored(device_map, index, logical_device) + return value_lib.regroup(device_map, index, wrap_class=value_lib.Mirrored) def batch_reduce_implementation(self, reduce_op, value_destination_pairs): all_devices_match = _all_devices_match(value_destination_pairs) diff --git a/tensorflow/python/distribute/cross_device_ops_test.py b/tensorflow/python/distribute/cross_device_ops_test.py index ea2241d8616..af9a258249a 100644 --- a/tensorflow/python/distribute/cross_device_ops_test.py +++ b/tensorflow/python/distribute/cross_device_ops_test.py @@ -32,9 +32,9 @@ from tensorflow.python.distribute import multi_worker_test_base from tensorflow.python.distribute import reduce_util from tensorflow.python.distribute import values as value_lib from tensorflow.python.eager import context -from tensorflow.python.framework import kernels from tensorflow.python.eager import test from tensorflow.python.framework import constant_op +from tensorflow.python.framework import kernels from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops @@ -45,6 +45,8 @@ def _get_devices(devices): return tuple(device_util.resolve(d) for d in devices) elif isinstance(devices, value_lib.DistributedValues): return devices.devices + elif isinstance(devices, ops.Tensor): + return (device_util.resolve(devices.device),) return (device_util.resolve(devices),) @@ -64,7 +66,7 @@ def _make_per_replica(values, devices, regroup=False): with ops.device(d): placed_v = array_ops.identity(v) index.append(placed_v) - return value_lib.PerReplica(value_lib.ReplicaDeviceMap(devices), index) + return value_lib.regroup(value_lib.ReplicaDeviceMap(devices), index) # pylint: disable=g-doc-args,g-doc-return-or-yield @@ -75,8 +77,14 @@ def _fake_mirrored(value, devices): true in reality. 
""" devices = _get_devices(devices) - return value_lib.Mirrored(value_lib.ReplicaDeviceMap(devices), - [value] * len(devices)) + values = [] + for d in devices: + with ops.device(d): + values.append(array_ops.identity(value)) + return value_lib.regroup( + value_lib.ReplicaDeviceMap(devices), + values, + wrap_class=value_lib.Mirrored) def _make_indexed_slices(values, indices, dense_shape, device): @@ -91,7 +99,10 @@ def _make_indexed_slices(values, indices, dense_shape, device): def _make_mirrored_indexed_slices(devices, values, indices, dense_shape): values = [_make_indexed_slices(values, indices, dense_shape, d) for d in devices] - return value_lib.Mirrored(value_lib.ReplicaDeviceMap(devices), values) + return value_lib.regroup( + value_lib.ReplicaDeviceMap(devices), + values, + wrap_class=value_lib.Mirrored) _cpu_device = "/device:CPU:0" @@ -109,22 +120,25 @@ class CrossDeviceOpsTestBase(test.TestCase, parameterized.TestCase): self.evaluate(ops.convert_to_tensor(right))) def _assert_values_equal(self, left, right): - if isinstance(left, list): + self.assertEqual(type(left), type(right)) + if isinstance(left, (list, tuple)): for l, r in zip(left, right): self._assert_values_equal(l, r) else: - self.assertEqual(type(left), type(right)) - self.assertEqual(set(left.devices), set(right.devices)) - if isinstance(left.values[0], ops.IndexedSlices): - for d in left.devices: - self._assert_indexed_slices_equal(left.get(d), right.get(d)) - elif context.executing_eagerly(): - self.assertEqual([v.numpy() for v in left.values], - list(right.values)) + if isinstance(left, value_lib.DistributedValues): + self.assertEqual(set(left.devices), set(right.devices)) + self._assert_values_equal([left.get(d) for d in sorted(left.devices)], + [right.get(d) for d in sorted(right.devices)]) else: - with self.cached_session() as sess: - self.assertEqual( - sess.run(list(left.values)), list(right.values)) + self.assertEqual( + device_util.resolve(left.device), device_util.resolve(right.device)) + if isinstance(left, ops.IndexedSlices): + self._assert_indexed_slices_equal(left, right) + elif context.executing_eagerly(): + self.assertEqual(left.numpy(), right.numpy()) + else: + with self.cached_session() as sess: + self.assertEqual(sess.run(left), sess.run(right)) def _testReductionAndBroadcast(self, cross_device_ops, devices): if context.num_gpus() < sum(1 for d in devices if "GPU" in d.upper()): @@ -139,8 +153,8 @@ class CrossDeviceOpsTestBase(test.TestCase, parameterized.TestCase): mean_2 = mean + 1. destination_mirrored = _fake_mirrored(1., devices) - destination_different = _fake_mirrored(1., _cpu_device) - destination_str = _cpu_device + destination_different = _fake_mirrored(1., device_util.resolve(_cpu_device)) + destination_str = device_util.resolve(_cpu_device) all_destinations = [ destination_mirrored, destination_different, destination_str, @@ -416,7 +430,9 @@ class MultiWorkerCrossDeviceOpsTest(multi_worker_test_base.MultiWorkerTestBase, @combinations.generate(multi_worker_allreduce_combinations) def testReductionAndBroadcast(self, cross_device_ops, devices): - self._testReductionAndBroadcast(cross_device_ops, devices) + # Mimic the default device of multi-worker strategies. 
+ with ops.device("/job:worker/replica:0/task:0"): + self._testReductionAndBroadcast(cross_device_ops, devices) NUM_WORKERS = 3 @@ -493,22 +509,27 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, "grpc://" + self._cluster_spec[task_type][task_id]) def _assert_values_equal(self, left, right, sess): - if isinstance(left, list): + self.assertEqual(type(left), type(right)) + if isinstance(left, (list, tuple)): for l, r in zip(left, right): self._assert_values_equal(l, r, sess) else: - self.assertEqual(type(left), type(right)) - self.assertEqual(set(left.devices), set(right.devices)) - - run_options = config_pb2.RunOptions() - run_options.experimental.collective_graph_key = 6 - - left_values = np.array( - sess.run(list(left.values), options=run_options)).flatten() - right_values = np.array(list(right.values)).flatten() - self.assertEqual(len(left_values), len(right_values)) - for l, r in zip(left_values, right_values): - self.assertEqual(l, r) + if isinstance(left, value_lib.DistributedValues): + self.assertEqual(set(left.devices), set(right.devices)) + self._assert_values_equal(left.values, right.values, sess) + else: + self.assertEqual( + device_util.resolve(left.device), device_util.resolve(right.device)) + if isinstance(left, ops.IndexedSlices): + self._assert_indexed_slices_equal(left, right) + elif context.executing_eagerly(): + self.assertEqual(left.numpy(), right.numpy()) + else: + run_options = config_pb2.RunOptions() + run_options.experimental.collective_graph_key = 6 + self.assertEqual( + sess.run(left, options=run_options), + sess.run(right, options=run_options)) def _test_reduction(self, task_type, @@ -533,10 +554,6 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, def _reduce(test_object, reduce_op, per_replica, destinations): if use_strategy_object: with test_object.scope(): - # Mimic the behavior that distribution strategy usually strips the - # wrapper if there is only one value. - if len(per_replica.values) == 1: - per_replica = per_replica.values[0] return test_object.extended.reduce_to(reduce_op, per_replica, destinations) else: @@ -663,12 +680,16 @@ class CollectiveAllReduceTest(multi_worker_test_base.MultiWorkerTestBase, else: result = collective_all_reduce.reduce(reduce_util.ReduceOp.SUM, per_replica, per_replica) - self.assertIsInstance(result, value_lib.Mirrored) + if num_gpus > 1: + self.assertIsInstance(result, value_lib.Mirrored) run_options = config_pb2.RunOptions() run_options.experimental.collective_graph_key = 7 - result = sess.run([ops.convert_to_tensor(v) for v in result.values], - options=run_options)[0] + if num_gpus > 1: + result = sess.run([ops.convert_to_tensor(v) for v in result.values], + options=run_options)[0] + else: + result = sess.run(ops.convert_to_tensor(result), options=run_options) # Reduce the same indexed slices on CPU locally as our expected results. devices_cpu = [(worker_device or "") + "/device:CPU:0"] * ( diff --git a/tensorflow/python/distribute/cross_device_utils.py b/tensorflow/python/distribute/cross_device_utils.py index 6058db356e2..6ef06b91799 100644 --- a/tensorflow/python/distribute/cross_device_utils.py +++ b/tensorflow/python/distribute/cross_device_utils.py @@ -576,7 +576,7 @@ def unpack_grad_tuple(gv, gpt): reduction. 
""" elt_widths = [x.num_elements() for x in gpt.shapes] - with ops.device(gv[0][0].device): + with ops.device(gv[0].device): with ops.name_scope('unpack'): splits = array_ops.split(gv[0], elt_widths) unpacked_gv = [] diff --git a/tensorflow/python/distribute/mirrored_variable_test.py b/tensorflow/python/distribute/mirrored_variable_test.py index 1bf995b881a..a5e682f09c3 100644 --- a/tensorflow/python/distribute/mirrored_variable_test.py +++ b/tensorflow/python/distribute/mirrored_variable_test.py @@ -454,6 +454,9 @@ class MirroredVariableCreationTest(test.TestCase): _ = distribution.extended.call_for_each_replica(model_fn, args=(names,)) def testSyncOnReadVariable(self, distribution): + if context.executing_eagerly(): + self.skipTest("Skip the test due to b/137400477.") + all_v_sum = {} all_v_mean = {} components_sum = {} @@ -554,6 +557,8 @@ class MirroredVariableCreationTest(test.TestCase): self.assertStartsWith(v1._op.name, "replica_1/") def testSyncOnReadVariableUpdate(self, distribution): + if context.executing_eagerly(): + self.skipTest("Skip the test due to b/137400477.") def model_fn(): v_sum = variable_scope.variable( diff --git a/tensorflow/python/distribute/moving_averages_test.py b/tensorflow/python/distribute/moving_averages_test.py index 97626ed3697..50dee774aa5 100644 --- a/tensorflow/python/distribute/moving_averages_test.py +++ b/tensorflow/python/distribute/moving_averages_test.py @@ -35,6 +35,7 @@ all_combinations = combinations.combine( strategy_combinations.default_strategy, strategy_combinations.one_device_strategy, strategy_combinations.mirrored_strategy_with_gpu_and_cpu, + strategy_combinations.central_storage_strategy_with_gpu_and_cpu, strategy_combinations.tpu_strategy, ], mode=["graph"]) From 8628b0b768c0bbb85c8d78db4922c81b21991cc5 Mon Sep 17 00:00:00 2001 From: Dero Gharibian Date: Sat, 20 Jul 2019 00:15:55 -0700 Subject: [PATCH 0229/3053] Added support for linking against _pywrap_tensorflow_internal.so in tf_pybind_extension_opensource targets. PiperOrigin-RevId: 259096934 --- tensorflow/tensorflow.bzl | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index d253d5b8799..eaa73eb30af 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -419,6 +419,13 @@ def tf_binary_additional_data_deps(): ], ) +def tf_binary_pybind_deps(): + return select({ + clean_dep("//tensorflow:macos"): [clean_dep("//tensorflow/python:lib_pywrap_tensorflow_internal.dylib")], + clean_dep("//tensorflow:windows"): [clean_dep("//tensorflow/python:_pywrap_tensorflow_internal.dll")], + "//conditions:default": [clean_dep("//tensorflow/python:lib_pywrap_tensorflow_internal.so")], + }) + # Helper function for the per-OS tensorflow libraries and their version symlinks def tf_shared_library_deps(): return select({ @@ -1895,7 +1902,11 @@ def tf_py_wrap_cc( # Convert a rule name such as foo/bar/baz to foo/bar/_baz.so # and use that as the name for the rule producing the .so file. - cc_library_name = "/".join(name.split("/")[:-1] + ["_" + module_name + ".so"]) + cc_library_base = "/".join(name.split("/")[:-1] + ["_" + module_name]) + + # TODO(b/137885063): tf_cc_shared_object needs to be cleaned up; we really + # shouldn't be passing a name qualified with .so here. 
+ cc_library_name = cc_library_base + ".so" cc_library_pyd_name = "/".join( name.split("/")[:-1] + ["_" + module_name + ".pyd"], ) @@ -1957,6 +1968,25 @@ def tf_py_wrap_cc( deps = deps + extra_deps, **kwargs ) + + # When a non-versioned .so is added as a 'src' to a bazel target, it uses + # -l%(so_name) instead of -l:%(so_file) during linking. When -l%(so_name) + # is passed to ld, it will look for an associated file with the schema + # lib%(so_name).so. Since pywrap_tensorflow is not explicitly versioned + # and is not prefixed with lib_, we add a rule for the creation of an .so + # file with the canonical lib schema (e.g. libNAME.so), so that + # -l%(so_name) is resolved during linking. + # + # See: https://github.com/bazelbuild/bazel/blob/7a6808260a733d50983c1adf0cf5a7493472267f/src/main/java/com/google/devtools/build/lib/rules/cpp/LibrariesToLinkCollector.java#L319 + for pattern in SHARED_LIBRARY_NAME_PATTERNS: + name_os = pattern % (cc_library_base, "") + native.genrule( + name = name_os + "_rule", + srcs = [":" + cc_library_name], + outs = [name_os], + cmd = "cp $< $@", + ) + native.genrule( name = "gen_" + cc_library_pyd_name, srcs = [":" + cc_library_name], @@ -2401,11 +2431,11 @@ def tf_pybind_extension( ) native.cc_binary( name = so_file, - srcs = srcs + hdrs, - data = data, + srcs = srcs + hdrs + tf_binary_additional_srcs() + tf_binary_pybind_deps(), + data = data + tf_binary_pybind_deps(), copts = copts, nocopts = nocopts, - linkopts = linkopts + select({ + linkopts = linkopts + _rpath_linkopts(name) + select({ "@local_config_cuda//cuda:darwin": [ "-Wl,-exported_symbols_list,$(location %s)" % exported_symbols_file, ], From f9a47e1074f3fd8f38fc288dcfb1bd880f81cb21 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sat, 20 Jul 2019 02:02:19 -0700 Subject: [PATCH 0230/3053] compat: Update forward compatibility horizon to 2019-07-20 PiperOrigin-RevId: 259104270 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index dd2ed951c8f..330066fc91b 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -31,7 +31,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2019, 7, 19) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2019, 7, 20) _FORWARD_COMPATIBILITY_HORIZON_OVERRIDDEN = False _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" From 3021fab5aa42ae536a31dbe7b61071f6e171eeb8 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sat, 20 Jul 2019 02:02:29 -0700 Subject: [PATCH 0231/3053] Update GraphDef version to 102. PiperOrigin-RevId: 259104295 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 0f98cd91fe3..ad5c3c56a84 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. 
#define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 101 // Updated: 2019/7/19 +#define TF_GRAPH_DEF_VERSION 102 // Updated: 2019/7/20 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From 15dd890a1e9f95e0bb7219a9ae2f846fe47e520b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sat, 20 Jul 2019 03:03:45 -0700 Subject: [PATCH 0232/3053] Merge TypeUtilities library into the IR library The TypeUtilities.{cpp,h}, currently living in {lib,include/mlir}/Support, do not belong to the Support library. Instead, they form a separate utility library that depends on the IR library. The operations it provides relate to standard types (tensors, memrefs) as well as to operation manipulation, making them a better fit for the main IR library. PiperOrigin-RevId: 259108314 --- tensorflow/compiler/mlir/lite/BUILD | 2 -- tensorflow/compiler/mlir/lite/ir/tfl_ops.cc | 2 +- .../compiler/mlir/lite/transforms/lower_static_tensor_list.cc | 2 +- tensorflow/compiler/mlir/tensorflow/BUILD | 1 - tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc | 2 +- tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h | 2 +- .../tensorflow/transforms/functional_control_flow_to_cfg.cc | 2 +- tensorflow/compiler/mlir/xla/BUILD | 1 - tensorflow/compiler/mlir/xla/ir/xla_ops.cc | 2 +- 9 files changed, 6 insertions(+), 10 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index 7846716e9dd..8aa78a2a869 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -181,7 +181,6 @@ cc_library( "@local_config_mlir//:QuantOps", "@local_config_mlir//:StandardOps", "@local_config_mlir//:Support", - "@local_config_mlir//:TypeUtilities", ], alwayslink = 1, ) @@ -234,7 +233,6 @@ cc_library( "@local_config_mlir//:QuantOps", "@local_config_mlir//:StandardOps", "@local_config_mlir//:Support", - "@local_config_mlir//:TypeUtilities", ], alwayslink = 1, ) diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc index 6c91470da07..b79545353f6 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc @@ -20,8 +20,8 @@ limitations under the License. #include "mlir/IR/OpImplementation.h" // TF:local_config_mlir #include "mlir/IR/PatternMatch.h" // TF:local_config_mlir #include "mlir/IR/StandardTypes.h" // TF:local_config_mlir +#include "mlir/IR/TypeUtilities.h" // TF:local_config_mlir #include "mlir/StandardOps/Ops.h" // TF:local_config_mlir -#include "mlir/Support/TypeUtilities.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" namespace mlir { diff --git a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc index f8831ef08e8..ad54a3633e3 100644 --- a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc +++ b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc @@ -35,6 +35,7 @@ limitations under the License. 
#include "mlir/IR/OperationSupport.h" // TF:local_config_mlir #include "mlir/IR/PatternMatch.h" // TF:local_config_mlir #include "mlir/IR/StandardTypes.h" // TF:local_config_mlir +#include "mlir/IR/TypeUtilities.h" // TF:local_config_mlir #include "mlir/IR/Types.h" // TF:local_config_mlir #include "mlir/IR/Value.h" // TF:local_config_mlir #include "mlir/Pass/Pass.h" // TF:local_config_mlir @@ -43,7 +44,6 @@ limitations under the License. #include "mlir/Support/Functional.h" // TF:local_config_mlir #include "mlir/Support/LLVM.h" // TF:local_config_mlir #include "mlir/Support/LogicalResult.h" // TF:local_config_mlir -#include "mlir/Support/TypeUtilities.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/utils/attribute_utils.h" diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index 9715a672660..f1adc29aa1b 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -131,7 +131,6 @@ cc_library( "@local_config_mlir//:StandardOps", "@local_config_mlir//:Support", "@local_config_mlir//:TransformUtils", - "@local_config_mlir//:TypeUtilities", ], # TODO(jpienaar): Merge in the dialect registration. alwayslink = 1, diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index 3e62dd786ec..41e168b8827 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -29,13 +29,13 @@ limitations under the License. #include "mlir/IR/Matchers.h" // TF:local_config_mlir #include "mlir/IR/OpImplementation.h" // TF:local_config_mlir #include "mlir/IR/PatternMatch.h" // TF:local_config_mlir +#include "mlir/IR/TypeUtilities.h" // TF:local_config_mlir #include "mlir/IR/Types.h" // TF:local_config_mlir #include "mlir/IR/Value.h" // TF:local_config_mlir #include "mlir/Parser.h" // TF:local_config_mlir #include "mlir/StandardOps/Ops.h" // TF:local_config_mlir #include "mlir/Support/LLVM.h" // TF:local_config_mlir #include "mlir/Support/STLExtras.h" // TF:local_config_mlir -#include "mlir/Support/TypeUtilities.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/core/platform/logging.h" diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h index 7885a8e6199..fff2ffa9a0a 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h @@ -27,7 +27,7 @@ limitations under the License. #include "mlir/IR/Module.h" // TF:local_config_mlir #include "mlir/IR/OpDefinition.h" // TF:local_config_mlir #include "mlir/IR/StandardTypes.h" // TF:local_config_mlir -#include "mlir/Support/TypeUtilities.h" // TF:local_config_mlir +#include "mlir/IR/TypeUtilities.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" namespace mlir { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc index bc9ed1111df..9b7ccdb365d 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc @@ -18,11 +18,11 @@ limitations under the License. 
#include "mlir/IR/Builders.h" // TF:local_config_mlir #include "mlir/IR/Operation.h" // TF:local_config_mlir +#include "mlir/IR/TypeUtilities.h" // TF:local_config_mlir #include "mlir/IR/Value.h" // TF:local_config_mlir #include "mlir/Pass/Pass.h" // TF:local_config_mlir #include "mlir/Pass/PassRegistry.h" // TF:local_config_mlir #include "mlir/StandardOps/Ops.h" // TF:local_config_mlir -#include "mlir/Support/TypeUtilities.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index c36299ee263..fd1aa690fff 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -152,7 +152,6 @@ cc_library( "@local_config_mlir//:StandardOps", "@local_config_mlir//:Support", "@local_config_mlir//:TransformUtils", - "@local_config_mlir//:TypeUtilities", ], alwayslink = 1, ) diff --git a/tensorflow/compiler/mlir/xla/ir/xla_ops.cc b/tensorflow/compiler/mlir/xla/ir/xla_ops.cc index 25da9da3d1d..f47d4a022fb 100644 --- a/tensorflow/compiler/mlir/xla/ir/xla_ops.cc +++ b/tensorflow/compiler/mlir/xla/ir/xla_ops.cc @@ -21,7 +21,7 @@ limitations under the License. #include "mlir/IR/Attributes.h" // TF:local_config_mlir #include "mlir/IR/Builders.h" // TF:local_config_mlir #include "mlir/IR/OpImplementation.h" // TF:local_config_mlir -#include "mlir/Support/TypeUtilities.h" // TF:local_config_mlir +#include "mlir/IR/TypeUtilities.h" // TF:local_config_mlir using namespace mlir; using namespace mlir::XLA; From c5bc30ed9f41cfe18211bd72bbd80c3a8567764f Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Sat, 20 Jul 2019 10:33:10 -0700 Subject: [PATCH 0233/3053] Fix `special_math_ops._transpose_if_necessary()` for Python 3.x. We were comparing the list of permutation indices to `range(len(perm))`, to avoid unnecessary transposes. On Python 2.x, this is a list, which means the equality comparison has the desired effect. On Python 3.x it is a range iterator, and the equality check fails, creating unnecessary transposes for users of `tf.einsum()`. 
PiperOrigin-RevId: 259131715 --- tensorflow/python/ops/special_math_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/ops/special_math_ops.py b/tensorflow/python/ops/special_math_ops.py index 6b47f7e6347..7ad841c18a4 100644 --- a/tensorflow/python/ops/special_math_ops.py +++ b/tensorflow/python/ops/special_math_ops.py @@ -521,7 +521,7 @@ def _einsum_reduction(t0, t0_axis_labels, t1, t1_axis_labels, axes_to_sum): def _transpose_if_necessary(tensor, perm): """Like transpose(), but avoids creating a new tensor if possible.""" - if perm != range(len(perm)): + if perm != list(range(len(perm))): return array_ops.transpose(tensor, perm=perm) else: return tensor From 3e2958befaa22595b754018e7e2ef089420ff17d Mon Sep 17 00:00:00 2001 From: amoitra Date: Sat, 20 Jul 2019 11:05:22 -0700 Subject: [PATCH 0234/3053] Added Transpose and a reshape --- .../xla/service/gpu/cudnn_conv_rewriter.cc | 33 +++++++++++++++---- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc index 9e59b1290ed..066e2daf52d 100755 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc @@ -265,18 +265,37 @@ MatchBackwardFilter(HloInstruction* conv) { int64 input_feature_dimension = backward_conv_dnums.input_feature_dimension(); int64 input_batch = lhs->shape().dimensions(input_batch_dimension); + int64 input_feature = lhs->shape().dimensions(input_feature_dimension); + + // Reshape batch_dim G*N -> [G,N] + std::vector reshape_dims = lhs->shape().dimensions(); + auto num_groups = conv->feature_group_count(); // Ensure that input_batch is exact multiple of conv->feature_group_count() CHECK_EQ(input_batch % conv->feature_group_count(), 0) << "Input batch should be an exact multiple of feature group count"; - int64 input_feature = lhs->shape().dimensions(input_feature_dimension); - - Shape new_shape = lhs->shape(); - new_shape.set_dimensions(input_batch_dimension, - input_batch / conv->feature_group_count()); - new_shape.set_dimensions(input_feature_dimension, - input_feature * conv->feature_group_count()); + reshape_dims[input_batch_dimension] = + reshape_dims[input_batch_dimension] / num_groups; + reshape_dims.insert(reshape_dims.begin() + input_batch_dimension, num_groups); HloComputation* c = conv->parent(); + lhs = c->AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(lhs->shape().element_type(), reshape_dims), lhs)); + + // Transpose G to the axis before C/G, For eg: [G, N, C/G, H, W] -> [N, G, + // C/G, H, W] + std::vector transpose_dims(lhs->shape().dimensions_size()); + std::iota(transpose_dims.begin(), transpose_dims.end(), 0); + transpose_dims.erase(transpose_dims.begin() + input_batch_dimension); + transpose_dims.insert(transpose_dims.begin() + input_feature_dimension, + input_batch_dimension); + lhs = c->AddInstruction( + HloInstruction::CreateTranspose(lhs->shape(), lhs, transpose_dims)); + + // Merge [G,C/G] -> [C] + Shape new_shape = lhs->shape(); + new_shape.DeleteDimension(input_feature_dimension); + new_shape.set_dimensions(input_feature_dimension, + input_feature * conv->feature_group_count()); lhs = c->AddInstruction(HloInstruction::CreateReshape(new_shape, lhs)); return std::make_tuple(true, backward_conv_window, backward_conv_dnums, lhs); } From 4258d145ba22fe82c5823ac317c8a584c26fd810 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Sat, 20 Jul 2019 
11:13:36 -0700 Subject: [PATCH 0235/3053] Add executor-to-control dialect conversion pass This pass convert an MLIR representation of TensorFlow graph from a mix of tf_executor and tf dialects to TensorFlow Control Dialect (_tf). This is intended for managing the transition only, the TensorFlow Control dialect is ultimately intended to be removed after the GraphDef importer is updated to target directly the tf_executor dialect. PiperOrigin-RevId: 259133988 --- tensorflow/compiler/mlir/tensorflow/BUILD | 1 + .../mlir/tensorflow/ir/tf_executor_ops.td | 7 +- .../tests/executor_to_control_dialect.mlir | 87 ++++++++ .../translate/control_to_executor_dialect.cc | 2 +- .../translate/executor_to_control_dialect.cc | 204 ++++++++++++++++++ .../tensorflow/translate/tf_mlir_translate.cc | 23 ++ 6 files changed, 317 insertions(+), 7 deletions(-) create mode 100644 tensorflow/compiler/mlir/tensorflow/tests/executor_to_control_dialect.mlir create mode 100644 tensorflow/compiler/mlir/tensorflow/translate/executor_to_control_dialect.cc diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index f1adc29aa1b..d0968317055 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -105,6 +105,7 @@ cc_library( "transforms/optimize.cc", "transforms/raise_control_flow.cc", "translate/control_to_executor_dialect.cc", + "translate/executor_to_control_dialect.cc", ], hdrs = [ "ir/control_flow_ops.h", diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td index 125ef1bfda6..748416a8142 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td @@ -415,10 +415,6 @@ def TfExecutor_NextIterationSourceOp : TfExecutor_Op<"NextIteration.Source", [No Note: Additional result corresponds to the control output. }]; - let arguments = (ins - Variadic:$controlInputs - ); - let results = (outs AnyType:$output, // The NextIteration.Source operation returns an extra token consumed by the sink. 
@@ -428,12 +424,11 @@ def TfExecutor_NextIterationSourceOp : TfExecutor_Op<"NextIteration.Source", [No let builders = [OpBuilder< "Builder *builder, OperationState *result, Type result_type, " - "ArrayRef control_inputs = {}, ArrayRef attributes = {}", + "ArrayRef attributes = {}", [{ Type token_type = TokenType::get(builder->getContext()); Type control_type = ControlType::get(builder->getContext()); result->types = { result_type, token_type, control_type }; - result->operands.append(control_inputs.begin(), control_inputs.end()); result->attributes.append(attributes.begin(), attributes.end()); }]> ]; diff --git a/tensorflow/compiler/mlir/tensorflow/tests/executor_to_control_dialect.mlir b/tensorflow/compiler/mlir/tensorflow/tests/executor_to_control_dialect.mlir new file mode 100644 index 00000000000..73446a84fee --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/executor_to_control_dialect.mlir @@ -0,0 +1,87 @@ +// RUN: tf-opt -tf-executor-to-control-conversion %s | FileCheck %s --dump-input=fail + +// CHECK-LABEL: func @LoopTest() { +func @LoopTest() { + tf_executor.graph { + %0:2 = tf_executor.island { + %cst = "tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "Const", value = dense<1> : tensor} : () -> tensor + tf_executor.yield %cst : tensor + } + %1:2 = tf_executor.Enter %0#0 frame "while/while_context" : (tensor) -> (tensor<*xi32>, !tf_executor.control) {T = "tfdtype$DT_INT32", device = "", name = "while/Enter"} + %2 = tf_executor.island { + "tf.NoOp"() {device = "", name = "cluster/pivot"} : () -> () + tf_executor.yield + } + %3:3 = tf_executor.NextIteration.Source : tensor<*xi32> {T = "tfdtype$DT_INT32", device = "", id = 0 : i64, name = "while/NextIteration"} + %4:3 = tf_executor.Merge %3#0, %1#0 : tensor<*xi32> {N = 2 : i64, T = "tfdtype$DT_INT32", device = "", name = "while/Merge"} + %5:2 = tf_executor.island(%4#2) { + %cst = "tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "while/Less/y", value = dense<2> : tensor} : () -> tensor + tf_executor.yield %cst : tensor + } + %6:2 = tf_executor.island { + %14 = "tf.Less"(%4#0, %5#0) {T = "tfdtype$DT_INT32", device = "", name = "while/Less"} : (tensor<*xi32>, tensor) -> tensor<*xi1> + tf_executor.yield %14 : tensor<*xi1> + } + %7:2 = tf_executor.LoopCond %6#0 : (tensor<*xi1>) -> (tensor, !tf_executor.control) {device = "", name = "while/LoopCond"} + %8:3 = tf_executor.Switch %4#0, %7#0 : tensor<*xi32> {T = "tfdtype$DT_INT32", _class = ["loc = @while/Merge"], device = "", name = "while/Switch"} + %9:2 = tf_executor.Exit %8#0 : tensor<*xi32> {T = "tfdtype$DT_INT32", device = "", name = "while/Exit"} + %10:2 = tf_executor.island { + %14 = "tf.Identity"(%8#1) {T = "tfdtype$DT_INT32", device = "", name = "while/Identity"} : (tensor<*xi32>) -> tensor<*xi32> + tf_executor.yield %14 : tensor<*xi32> + } + %11:2 = tf_executor.island(%10#1) { + %cst = "tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "while/Add/y", value = dense<3> : tensor} : () -> tensor + tf_executor.yield %cst : tensor + } + %12:2 = tf_executor.island { + %14 = "tf.Add"(%10#0, %11#0) {T = "tfdtype$DT_INT32", device = "", name = "while/Add"} : (tensor<*xi32>, tensor) -> tensor<*xi32> + tf_executor.yield %14 : tensor<*xi32> + } + %13 = tf_executor.ControlTrigger %2, %12#1, %9#1 {_tpu_replicate = "cluster", device = "", name = "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/b_sync"} + tf_executor.NextIteration.Sink [%3#1] %12#0, %13 : tensor<*xi32> {T = "tfdtype$DT_INT32", device = "", id = 0 : i64, name = 
"while/NextIteration"} + tf_executor.fetch + } + return +} + +// CHECK-NEXT: %[[CONST:[0-9]*]]:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "Const", value = dense<1> : tensor} : () -> (tensor, !_tf.control) +// CHECK-NEXT: %[[ENTER:[0-9]*]]:2 = "_tf.Enter"(%[[CONST]]#0) {T = "tfdtype$DT_INT32", device = "", frame_name = "while/while_context", is_constant = false, name = "while/Enter", parallel_iterations = 10 : i64} : (tensor) -> (tensor<*xi32>, !_tf.control) +// CHECK-NEXT: %[[NOOP:[0-9]*]] = "_tf.NoOp"() {device = "", name = "cluster/pivot"} : () -> !_tf.control +// CHECK-NEXT: %[[SOURCE:[0-9]*]]:2 = "_tf.NextIteration.source"() {T = "tfdtype$DT_INT32", device = "", id = 0 : i64, name = "while/NextIteration"} : () -> (tensor<*xi32>, !_tf.control) +// CHECK-NEXT: %[[MERGE:[0-9]*]]:3 = "_tf.Merge"(%[[SOURCE]]#0, %[[ENTER]]#0) {N = 2 : i64, T = "tfdtype$DT_INT32", device = "", name = "while/Merge"} : (tensor<*xi32>, tensor<*xi32>) -> (tensor<*xi32>, tensor, !_tf.control) +// CHECK-NEXT: %[[CONST_LESS:[0-9]*]]:2 = "_tf.Const"(%[[MERGE]]#2) {device = "", dtype = "tfdtype$DT_INT32", name = "while/Less/y", value = dense<2> : tensor} : (!_tf.control) -> (tensor, !_tf.control) +// CHECK-NEXT: %[[LESS:[0-9]*]]:2 = "_tf.Less"(%[[MERGE]]#0, %[[CONST_LESS]]#0) {T = "tfdtype$DT_INT32", device = "", name = "while/Less"} : (tensor<*xi32>, tensor) -> (tensor<*xi1>, !_tf.control) +// CHECK-NEXT: %[[COND:[0-9]*]]:2 = "_tf.LoopCond"(%[[LESS]]#0) {device = "", name = "while/LoopCond"} : (tensor<*xi1>) -> (tensor, !_tf.control) +// CHECK-NEXT: %[[SWITCH:[0-9]*]]:3 = "_tf.Switch"(%[[MERGE]]#0, %[[COND]]#0) {T = "tfdtype$DT_INT32", _class = ["loc = @while/Merge"], device = "", name = "while/Switch"} : (tensor<*xi32>, tensor) -> (tensor<*xi32>, tensor<*xi32>, !_tf.control) +// CHECK-NEXT: %[[EXIT:[0-9]*]]:2 = "_tf.Exit"(%[[SWITCH]]#0) {T = "tfdtype$DT_INT32", device = "", name = "while/Exit"} : (tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) +// CHECK-NEXT: %[[IDENTITY:[0-9]*]]:2 = "_tf.Identity"(%[[SWITCH]]#1) {T = "tfdtype$DT_INT32", device = "", name = "while/Identity"} : (tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) +// CHECK-NEXT: %[[CONST_ADD:[0-9]*]]:2 = "_tf.Const"(%[[IDENTITY]]#1) {device = "", dtype = "tfdtype$DT_INT32", name = "while/Add/y", value = dense<3> : tensor} : (!_tf.control) -> (tensor, !_tf.control) +// CHECK-NEXT: %[[ADD:[0-9]*]]:2 = "_tf.Add"(%[[IDENTITY]]#0, %[[CONST_ADD]]#0) {T = "tfdtype$DT_INT32", device = "", name = "while/Add"} : (tensor<*xi32>, tensor) -> (tensor<*xi32>, !_tf.control) +// CHECK-NEXT: %[[CT:[0-9]*]] = "_tf.ControlTrigger"(%[[NOOP]], %[[ADD]]#1, %[[EXIT]]#1) {_tpu_replicate = "cluster", device = "", name = "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/b_sync"} : (!_tf.control, !_tf.control, !_tf.control) -> !_tf.control +// CHECK-NEXT: %[[SINK:[0-9]*]] = "_tf.NextIteration.sink"(%[[ADD]]#0, %[[CT]]) {T = "tfdtype$DT_INT32", device = "", id = 0 : i64, name = "while/NextIteration"} : (tensor<*xi32>, !_tf.control) -> !_tf.control +// CHECK-NEXT: return + + + + +// CHECK-LABEL: func @multiple_ops_region +func @multiple_ops_region(%arg0 : tensor<*xi32>, %arg1 : tensor) { + tf_executor.graph { + %0:2 = tf_executor.island { + // The 4 operations are independent, but the current conversion will add + // control dependencies conservatively. 
+ %1 = "tf.Add"(%arg0, %arg1) {T = "tfdtype$DT_INT32", device = "", name = "while/Add1"} : (tensor<*xi32>, tensor) -> tensor<*xi32> + %2 = "tf.Add"(%arg0, %arg1) {T = "tfdtype$DT_INT32", device = "", name = "while/Add2"} : (tensor<*xi32>, tensor) -> tensor<*xi32> + %3 = "tf.Add"(%arg0, %arg1) {T = "tfdtype$DT_INT32", device = "", name = "while/Add3"} : (tensor<*xi32>, tensor) -> tensor<*xi32> + %4 = "tf.Add"(%arg0, %arg1) {T = "tfdtype$DT_INT32", device = "", name = "while/Add4"} : (tensor<*xi32>, tensor) -> tensor<*xi32> + tf_executor.yield %4 : tensor<*xi32> + } + tf_executor.fetch + } + return +} + +// CHECK-NEXT: %[[ADD1:[0-9]*]]:2 = "_tf.Add"(%arg0, %arg1) {T = "tfdtype$DT_INT32", device = "", name = "while/Add1"} : (tensor<*xi32>, tensor) -> (tensor<*xi32>, !_tf.control) +// CHECK-NEXT: %[[ADD2:[0-9]*]]:2 = "_tf.Add"(%arg0, %arg1, %[[ADD1]]#1) {T = "tfdtype$DT_INT32", device = "", name = "while/Add2"} : (tensor<*xi32>, tensor, !_tf.control) -> (tensor<*xi32>, !_tf.control) +// CHECK-NEXT: %[[ADD3:[0-9]*]]:2 = "_tf.Add"(%arg0, %arg1, %[[ADD2]]#1) {T = "tfdtype$DT_INT32", device = "", name = "while/Add3"} : (tensor<*xi32>, tensor, !_tf.control) -> (tensor<*xi32>, !_tf.control) +// CHECK-NEXT: %[[ADD4:[0-9]*]]:2 = "_tf.Add"(%arg0, %arg1, %[[ADD3]]#1) {T = "tfdtype$DT_INT32", device = "", name = "while/Add4"} : (tensor<*xi32>, tensor, !_tf.control) -> (tensor<*xi32>, !_tf.control) diff --git a/tensorflow/compiler/mlir/tensorflow/translate/control_to_executor_dialect.cc b/tensorflow/compiler/mlir/tensorflow/translate/control_to_executor_dialect.cc index 4d9b3ca7ab7..507d077af02 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/control_to_executor_dialect.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/control_to_executor_dialect.cc @@ -155,7 +155,7 @@ void ControlToExecutorDialectConversion::runOnFunction() { loc, types, operands, ArrayRef{}); } else if (op.getName().getStringRef() == "_tf.NextIteration.source") { replacement = builder.create( - loc, op.getResult(0)->getType(), operands); + loc, op.getResult(0)->getType()); // Record a mapping of the name to the nextiteration.source so that when // we convert the sink we can get the token. StringAttr frame = op.getAttrOfType("name"); diff --git a/tensorflow/compiler/mlir/tensorflow/translate/executor_to_control_dialect.cc b/tensorflow/compiler/mlir/tensorflow/translate/executor_to_control_dialect.cc new file mode 100644 index 00000000000..546898fe389 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/translate/executor_to_control_dialect.cc @@ -0,0 +1,204 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// This transformation pass transforms from TF executor dialect to MLIR TF +// contol dialect. 
+ +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Sequence.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "mlir/IR/Builders.h" // TF:local_config_mlir +#include "mlir/IR/Operation.h" // TF:local_config_mlir +#include "mlir/IR/Value.h" // TF:local_config_mlir +#include "mlir/Pass/Pass.h" // TF:local_config_mlir +#include "mlir/Pass/PassRegistry.h" // TF:local_config_mlir +#include "mlir/StandardOps/Ops.h" // TF:local_config_mlir +#include "mlir/Support/LLVM.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" + +#define DEBUG_TYPE "tf-executor-to-ctl" + +namespace mlir { + +namespace { +struct ExecutorToControlDialectConversion + : public FunctionPass { + void runOnFunction() override; +}; +} // end anonymous namespace + +static bool HasSingleGraph(FuncOp function) { + if (function.getBlocks().size() != 1) return false; + if (!std::next(function.begin()->begin())->isKnownTerminator()) return false; + if (!isa(function.begin()->begin())) return false; + return true; +} + +void ExecutorToControlDialectConversion::runOnFunction() { + if (!HasSingleGraph(getFunction())) { + LLVM_DEBUG(llvm::dbgs() + << "Expect a Function with a single block and a single graph op," + " skip tf_executor dialect conversion\n"); + return; + } + Type control_type = TFControlFlow::TFControlType::get(&getContext()); + + Block &body = getFunction().front(); + OpBuilder builder(&body, body.begin()); + auto graph = cast(body.front()); + SmallString<64> new_op_name; + for (auto &op : llvm::make_early_inc_range(graph.GetBody())) { + LLVM_DEBUG(llvm::dbgs() << "Process: " << op.getName() << "\n"); + if (auto fetch = dyn_cast(op)) { + // Replace all the operands of the fetch op with the uses of the graph + // results, the graph op will then be removed. + for (auto ops_and_ret_vals : + llvm::zip(graph.getResults(), fetch.getOperands())) + std::get<0>(ops_and_ret_vals) + ->replaceAllUsesWith(std::get<1>(ops_and_ret_vals)); + continue; + } + if (auto island = dyn_cast(op)) { + Value *ctl_sequence = nullptr; + Operation *last_replaced_op = nullptr; + for (Operation &wrapped_op : island.GetBody()) { + LLVM_DEBUG(llvm::dbgs() + << " In island: " << wrapped_op.getName() << "\n"); + if (isa(wrapped_op)) { + for (auto ops_and_ret_vals : + llvm::zip(island.getResults(), wrapped_op.getOperands())) + std::get<0>(ops_and_ret_vals) + ->replaceAllUsesWith(std::get<1>(ops_and_ret_vals)); + break; + } + // Add a leading _ off the name. + new_op_name = "_"; + new_op_name += wrapped_op.getName().getStringRef(); + OperationState state(wrapped_op.getLoc(), new_op_name); + + // Add an operand for each non-control input we find. Collect control + // values separately to add them to the island operands + state.operands.append(wrapped_op.getOperands().begin(), + wrapped_op.getOperands().end()); + + // Chain operations through a control dependency, except for the first + // operations in the sequence that carry the control dependencies held + // by the island itself. 
+ if (ctl_sequence) { + state.operands.push_back(ctl_sequence); + } else { + for (Value *ctl_operand : island.getOperands()) + state.operands.push_back(ctl_operand); + } + + // Add a result type for each result + state.types.append(wrapped_op.getResultTypes().begin(), + wrapped_op.getResultTypes().end()); + state.types.push_back(control_type); + + // Create the replacement operation. + auto *replacement = builder.createOperation(state); + replacement->setAttrs(wrapped_op.getAttrList()); + + for (auto ops_and_ret_vals : + llvm::zip(wrapped_op.getResults(), replacement->getResults())) + std::get<0>(ops_and_ret_vals) + ->replaceAllUsesWith(std::get<1>(ops_and_ret_vals)); + + ctl_sequence = replacement->getResult(replacement->getNumResults() - 1); + last_replaced_op = replacement; + } + for (Value *island_ctl : island.getResults()) + island_ctl->replaceAllUsesWith( + last_replaced_op->getResult(last_replaced_op->getNumResults() - 1)); + op.erase(); + continue; + } + + new_op_name.clear(); + if (isa(op)) { + new_op_name = "_tf.Switch"; + } else if (isa(op)) { + new_op_name = "_tf.SwitchN"; + } else if (isa(op)) { + new_op_name = "_tf.Merge"; + } else if (isa(op)) { + new_op_name = "_tf.NextIteration.source"; + } else if (isa(op)) { + new_op_name = "_tf.NextIteration.sink"; + } else if (isa(op)) { + new_op_name = "_tf.LoopCond"; + } else if (isa(op)) { + new_op_name = "_tf.Enter"; + } else if (isa(op)) { + new_op_name = "_tf.Exit"; + } else if (isa(op)) { + new_op_name = "_tf.ControlTrigger"; + } else { + op.emitOpError() << "unhandled op in tf_executor to _tf conversion"; + return signalPassFailure(); + } + OperationState state(op.getLoc(), new_op_name); + // Token results are dropped when we process the source op, the operand + // becomes nullptr by the time we process the sink op, filter it out here. + auto non_null_operands = + llvm::make_filter_range(op.getOperands(), [](Value *v) { return v; }); + state.operands.append(non_null_operands.begin(), non_null_operands.end()); + for (Type result_type : op.getResultTypes()) { + // Filter out TokenType, they don't exist in the control dialect. + if (result_type.isa()) continue; + if (!result_type.isa()) + state.types.push_back(result_type); + else + state.types.push_back(control_type); + } + // The control dialect has a control result for the sink operation. + if (isa(op)) + state.types.push_back(control_type); + + // Create the replacement operation. 
+ auto *replacement = builder.createOperation(state); + replacement->setAttrs(op.getAttrList()); + + if (auto next_iteration = + dyn_cast(op)) { + next_iteration.output()->replaceAllUsesWith(replacement->getResult(0)); + next_iteration.token()->dropAllUses(); + next_iteration.control()->replaceAllUsesWith(replacement->getResult(1)); + } else { + for (auto ops_and_ret_vals : + llvm::zip(op.getResults(), replacement->getResults())) + std::get<0>(ops_and_ret_vals) + ->replaceAllUsesWith(std::get<1>(ops_and_ret_vals)); + } + op.erase(); + } + graph.erase(); +} + +FunctionPassBase *CreateTFExecutorToControlDialectConversion() { + return new ExecutorToControlDialectConversion(); +} + +} // namespace mlir + +static mlir::PassRegistration pass( + "tf-executor-to-control-conversion", + "Convert from TF executor dialect to TF control dialect"); diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc index 5c7b1e824fe..cd4878112ae 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc @@ -26,6 +26,7 @@ limitations under the License. #include "mlir/IR/Operation.h" // TF:local_config_mlir #include "mlir/IR/StandardTypes.h" // TF:local_config_mlir #include "mlir/Parser.h" // TF:local_config_mlir +#include "mlir/Pass/PassManager.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/tensorflow/translate/import_graphdef.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" @@ -35,6 +36,14 @@ limitations under the License. #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/protobuf/graph_debug_info.pb.h" +namespace mlir { +/// Create a pass to convert from the TF control to the TFExecutor dialect. +FunctionPassBase* CreateTFControlToExecutorDialectConversion(); + +/// Create a pass to convert from the TFExecutor to the TF control dialect. +FunctionPassBase* CreateTFExecutorToControlDialectConversion(); +} // namespace mlir + namespace tensorflow { using stream_executor::port::Status; @@ -80,6 +89,20 @@ mlir::OwningModuleRef GraphdefToMlirTranslateFunction( LOG(ERROR) << "Graph import failed: " << module_or.status(); return nullptr; } + + // Round-trip to the tf_executor dialect, this is temporary while bringing up + // the new dialect. 
+ { + mlir::PassManager pm; + pm.addPass(mlir::CreateTFControlToExecutorDialectConversion()); + pm.addPass(mlir::CreateTFExecutorToControlDialectConversion()); + if (failed(pm.run(module_or.ValueOrDie().get()))) { + module_or.ValueOrDie()->emitOpError() + << "Round-trip to tf_executor dialect failed"; + return nullptr; + } + } + return module_or.ConsumeValueOrDie(); } From fca2509e3b3d6252fa34f6e35d8a359c0e5cbf64 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sat, 20 Jul 2019 19:03:59 +0000 Subject: [PATCH 0236/3053] Format tf.function's error message when input and signature does not match This fix tries to address the issue raised in 30576 where the error message is hard to interpret: ``` ValueError: Python inputs incompatible with input_signature: inputs ((, , , , , , , , )), input_signature ((TensorSpec(shape=(?, ?, 1), dtype=tf.float32, name=None), TensorSpec(shape=(?, ?, 2), dtype=tf.float32, name=None), TensorSpec(shape=(?, ?, 3), dtype=tf.float32, name=None), TensorSpec(shape=(?, ?, 4), dtype=tf.float32, name=None), TensorSpec(shape=(?, ?, 5), dtype=tf.float32, name=None), TensorSpec(shape=(?, ?, 6), dtype=tf.float32, name=None), TensorSpec(shape=(?, ?, 7), dtype=tf.float32, name=None), TensorSpec(shape=(?, ?, 8), dtype=tf.float32, name=None), TensorSpec(shape=(?, ?, 9), dtype=tf.float32, name=None))) ``` This fix formats the error message: ``` ValueError: Python inputs incompatible with input_signature: inputs: ( Tensor("random_normal:0", shape=(1, 123, 1), dtype=float32), Tensor("random_normal_1:0", shape=(1, 123, 2), dtype=float32), Tensor("random_normal_2:0", shape=(1, 123, 3), dtype=float32), Tensor("random_normal_3:0", shape=(1, 123, 4), dtype=float32), Tensor("random_normal_4:0", shape=(1, 123, 5), dtype=float32), Tensor("random_normal_5:0", shape=(1, 123, 6), dtype=float32), Tensor("random_normal_6:0", shape=(1, 123, 7), dtype=float32), Tensor("random_normal_7:0", shape=(1, 123, 8), dtype=float32), Tensor("random_normal_8:0", shape=(1, 123, 1), dtype=float32)) input_signature: ( TensorSpec(shape=(?, ?, 1), dtype=tf.float32, name=None), TensorSpec(shape=(?, ?, 2), dtype=tf.float32, name=None), TensorSpec(shape=(?, ?, 3), dtype=tf.float32, name=None), TensorSpec(shape=(?, ?, 4), dtype=tf.float32, name=None), TensorSpec(shape=(?, ?, 5), dtype=tf.float32, name=None), TensorSpec(shape=(?, ?, 6), dtype=tf.float32, name=None), TensorSpec(shape=(?, ?, 7), dtype=tf.float32, name=None), TensorSpec(shape=(?, ?, 8), dtype=tf.float32, name=None), TensorSpec(shape=(?, ?, 9), dtype=tf.float32, name=None)) ``` This fix fixes 30576. Signed-off-by: Yong Tang --- tensorflow/python/eager/function.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py index f8fd53ec83d..420d3dd6027 100644 --- a/tensorflow/python/eager/function.py +++ b/tensorflow/python/eager/function.py @@ -1548,12 +1548,17 @@ def _convert_inputs_to_signature(inputs, input_signature, flat_input_signature): "Inputs (%s), input_signature(%s)." 
% (str(inputs), str(input_signature))) + def format_error_message(inputs, input_signature): + return (" inputs: (\n " + + ",\n ".join([str(i) for i in inputs]) + + ")\n input_signature: (\n " + + ",\n ".join([str(i) for i in input_signature]) + + ")") if any(not spec.is_compatible_with(other) for spec, other in zip( flat_input_signature, flatten_inputs)): - raise ValueError("Python inputs incompatible with input_signature: " - "inputs (%s), input_signature (%s)" % - (str(inputs), str(input_signature))) + raise ValueError("Python inputs incompatible with input_signature:\n%s" % + format_error_message(inputs, input_signature)) if need_packing: inputs = nest.pack_sequence_as( From 02e7e30343af69ea4c9bcf0169862c155b4f66c8 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sat, 20 Jul 2019 19:11:12 +0000 Subject: [PATCH 0237/3053] Also format all related ValueError when input and input_signature are needed Signed-off-by: Yong Tang --- tensorflow/python/eager/function.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py index 420d3dd6027..95f52de95e2 100644 --- a/tensorflow/python/eager/function.py +++ b/tensorflow/python/eager/function.py @@ -1521,6 +1521,12 @@ def _convert_numpy_inputs(inputs): def _convert_inputs_to_signature(inputs, input_signature, flat_input_signature): """Convert inputs to pass into a function with an explicit signature.""" + def format_error_message(inputs, input_signature): + return (" inputs: (\n " + + ",\n ".join([str(i) for i in inputs]) + + ")\n input_signature: (\n " + + ",\n ".join([str(i) for i in input_signature]) + + ")") try: # TODO(b/124370185): Use all elements as inputs to throw an error if there # are ignored arguments. Calling with arguments that are not part of the @@ -1531,8 +1537,8 @@ def _convert_inputs_to_signature(inputs, input_signature, flat_input_signature): expand_composites=True) except ValueError: raise ValueError("Structure of Python function inputs does not match " - "input_signature. Inputs (%s), input_signature(%s)." % - (str(inputs), str(input_signature))) + "input_signature:\n" % + format_error_message(inputs, input_signature)) need_packing = False for index, (value, spec) in enumerate(zip(flatten_inputs, @@ -1544,16 +1550,10 @@ def _convert_inputs_to_signature(inputs, input_signature, flat_input_signature): need_packing = True except ValueError: raise ValueError("When input_signature is provided, all inputs to " - "the Python function must be convertible to tensors." - "Inputs (%s), input_signature(%s)." 
% - (str(inputs), str(input_signature))) + "the Python function must be convertible to " + "tensors:\n" % + format_error_message(inputs, input_signature)) - def format_error_message(inputs, input_signature): - return (" inputs: (\n " + - ",\n ".join([str(i) for i in inputs]) + - ")\n input_signature: (\n " + - ",\n ".join([str(i) for i in input_signature]) + - ")") if any(not spec.is_compatible_with(other) for spec, other in zip( flat_input_signature, flatten_inputs)): From 84ed39ed5109d39f4ec22a50bba1170b61170c7b Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sat, 20 Jul 2019 19:17:24 +0000 Subject: [PATCH 0238/3053] Pylint fix Signed-off-by: Yong Tang --- tensorflow/python/eager/function.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py index 95f52de95e2..c6b1f33068c 100644 --- a/tensorflow/python/eager/function.py +++ b/tensorflow/python/eager/function.py @@ -1522,11 +1522,15 @@ def _convert_numpy_inputs(inputs): def _convert_inputs_to_signature(inputs, input_signature, flat_input_signature): """Convert inputs to pass into a function with an explicit signature.""" def format_error_message(inputs, input_signature): - return (" inputs: (\n " + - ",\n ".join([str(i) for i in inputs]) + - ")\n input_signature: (\n " + - ",\n ".join([str(i) for i in input_signature]) + - ")") + return (" inputs: (\n" + + " " + + ",\n ".join([str(i) for i in inputs]) + + ")\n" + + " input_signature: (\n" + + " " + + ",\n ".join([str(i) for i in input_signature]) + + ")") + try: # TODO(b/124370185): Use all elements as inputs to throw an error if there # are ignored arguments. Calling with arguments that are not part of the From 15760703cb0253749f75dd4afb75854cb72dee52 Mon Sep 17 00:00:00 2001 From: Ayush Dubey Date: Sat, 20 Jul 2019 12:32:40 -0700 Subject: [PATCH 0239/3053] Insert identity after all uses of an output that is repeated across candidate op groups for scoped allocator optimizer. cl/257291024 introduced a technique of adding identity ops whenever an output was consumed by ops in different scopes and those ops were being optimized by the scoped allocator optimizer. However, that change introduced the identity for every use after the first one. For example, if output `o` was consumed by nodes `n1` and `scope/n2`, the optimizer would insert an identity between `o` and `scope/n2` but `n1` would continue to have `o` as a direct input. This introduced the following data race: after the optimizer runs, `o` will be a slice of a scope allocated buffer, and the identity would read the same slice. However, the entire buffer would itself be consumed by a different op, and that op may write to the buffer while the identity reads from the buffer. This change fixes the race by adding an identity node between an output and *all* its consumers. This means in the previous example, the optimizer would introduce an identity between both (`o`, `n1`) and (`o`, `scope/n2`). 
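As a concrete illustration of the rewiring described above, here is a minimal, self-contained C++ sketch; the `Node`/`Graph` types and the naming scheme are hypothetical stand-ins for this note only and are not the Grappler API (the real optimizer operates on `NodeDef`s via `NodeMap`).

```
#include <string>
#include <vector>

// Hypothetical toy graph types, used only to illustrate the rewiring.
struct Node {
  std::string name;
  std::vector<std::string> inputs;  // each entry names a producer output
};

struct Graph {
  std::vector<Node> nodes;
};

// Rewire *every* consumer of `output` (e.g. both `n1` and `scope/n2`)
// through its own Identity node, so that no consumer reads the original
// tensor directly once it becomes a slice of a scope-allocated buffer.
void RewireThroughIdentities(Graph* graph, const std::string& output) {
  std::vector<Node> identities;
  for (Node& consumer : graph->nodes) {
    for (std::string& input : consumer.inputs) {
      if (input != output) continue;
      Node identity;
      identity.name = consumer.name + "/identity_of_" + output;
      identity.inputs.push_back(output);
      input = identity.name;  // the consumer now reads through the identity
      identities.push_back(identity);
    }
  }
  // Append the new identities only after iterating, so references into
  // graph->nodes stay valid.
  graph->nodes.insert(graph->nodes.end(), identities.begin(),
                      identities.end());
}
```

With `output = "o"` and consumers `n1` and `scope/n2`, both consumers end up reading through their own identity, which is the behavior this change establishes (previously only `scope/n2` was rewired).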
PiperOrigin-RevId: 259138773 --- tensorflow/core/grappler/optimizers/BUILD | 3 +- .../optimizers/scoped_allocator_optimizer.cc | 51 ++++++++++++------- .../optimizers/scoped_allocator_optimizer.h | 19 +++++-- 3 files changed, 49 insertions(+), 24 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD index 65a8b52c05b..50036a56d1d 100644 --- a/tensorflow/core/grappler/optimizers/BUILD +++ b/tensorflow/core/grappler/optimizers/BUILD @@ -893,6 +893,8 @@ cc_library( "//tensorflow/core/grappler:utils", "//tensorflow/core/grappler/costs:graph_properties", "//tensorflow/core/grappler/utils:frame", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", ], ) @@ -900,7 +902,6 @@ tf_cc_test( name = "scoped_allocator_optimizer_test", size = "small", srcs = ["scoped_allocator_optimizer_test.cc"], - tags = ["notsan"], # TODO(b/137795054): re-enable after fixing race. deps = [ ":scoped_allocator_optimizer", "//tensorflow/cc:cc_ops", diff --git a/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc b/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc index 29bc154eb0e..c8c9096eb07 100644 --- a/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc @@ -172,11 +172,11 @@ Status RemoveEdge(const string& input_edge_name, const string& from_node_name, // If `input` is an Exit node, we add an identity to avoid the case when Exit // has inputs from different frames. // -// If `input` has kScopedAllocatorAttrName attribute, this means that it was -// previously marked for allocation with a different scope id. Since there can -// be only one scope id per output, we insert an identity between the input and -// op. This will ensure that the identity becomes the new input to op, and this -// identity can be marked with a new scope id different from `input`. +// If `input` is in `sa_opti->repeated_outputs()`, this means that it will be +// potentially used by multiple scope ids. Since there can be only one scope id +// per output, we insert an identity between the input and op. This will ensure +// that the identity becomes the new input to op, and this identity can be +// marked with a new scope id different from `input`. 
// // If the graph is rewritten, this function will perform the following change: // @@ -196,16 +196,9 @@ Status MaybeRewriteInput(ScopedAllocatorOptimizer* sa_opti, NodeDef* input, const string& edge_name, int output_index, NodeDef* op, NodeDef** new_input, int* new_output_index) { - bool rewrite = false; - if (IsExit(*input)) { - rewrite = true; - } else { - AttrSlice input_attrs = AttrSlice(*input); - std::vector scopes; - Status sa_status = - GetNodeAttr(input_attrs, kScopedAllocatorAttrName, &scopes); - rewrite = sa_status.ok(); - } + bool rewrite = + IsExit(*input) || (sa_opti->repeated_outputs().find(edge_name) != + sa_opti->repeated_outputs().end()); if (!rewrite) { *new_input = input; *new_output_index = output_index; @@ -783,7 +776,7 @@ Status ScopedAllocatorOptimizer::Optimize(Cluster* /*cluster*/, assume_valid_feeds, /*aggressive_shape_inference=*/false, /*include_tensor_values=*/false)); *optimized_graph = item.graph; - node_map_.reset(new NodeMap(optimized_graph)); + node_map_ = absl::make_unique(optimized_graph); LOG_WARNING_AND_RETURN_IF_ERROR(ScopedAllocatorOptimizer::ProcessGraphDef( optimized_graph, graph_properties)); @@ -869,7 +862,7 @@ class Tree { string edge_; int depth_; std::vector nodes_; - std::unordered_map subtrees_; + absl::flat_hash_map subtrees_; }; // Applies a function to every Tree in DFS order. Terminates early @@ -905,7 +898,7 @@ void PartitionByLoopStructure(const FrameView& frame_view, std::vector>* loop_groups) { // It is assumed that two nodes with identical loop containment have // identical integer vectors. Represent those by 64 bit hashes. - std::unordered_map> loop_sets; + absl::flat_hash_map> loop_sets; for (NodeDef* nd : nodes) { uint64 hash = 0; const std::vector& loop_ids = frame_view.Frames(*nd); @@ -919,6 +912,19 @@ void PartitionByLoopStructure(const FrameView& frame_view, } } +// Identify outputs that are inputs to multiple sets of nodes. +void IdentifyRepeatedInputs(const std::vector& nodes, + absl::flat_hash_set* seen_outputs, + absl::flat_hash_set* repeated_outputs) { + for (NodeDef* node : nodes) { + for (const auto& input_name : node->input()) { + if (!seen_outputs->insert(input_name).second) { + repeated_outputs->insert(input_name); + } + } + } +} + } // namespace Status ScopedAllocatorOptimizer::ProcessGraphDef( @@ -954,6 +960,15 @@ Status ScopedAllocatorOptimizer::ProcessGraphDef( } rewriter->SetGraphProperties(graph_properties); std::unique_ptr root(ComputeScopeTree(it.first, it.second)); + // Record outputs that are inputs to multiple Tree nodes. + absl::flat_hash_set seen_outputs; + status = ApplyToAll(root.get(), [this, &seen_outputs](Tree* t) { + IdentifyRepeatedInputs(t->nodes_, &seen_outputs, &repeated_outputs_); + return Status::OK(); + }); + if (!status.ok()) { + break; + } // Nodes with a common depth and root path are now grouped // in the same Tree struct. Split those groups into subgroups that // share identical loop nesting. diff --git a/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.h b/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.h index 20c29a56446..2aaf461591d 100644 --- a/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.h +++ b/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.h @@ -16,10 +16,11 @@ limitations under the License. 
#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_SCOPED_ALLOCATOR_OPTIMIZER_H_ #include -#include #include #include +#include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" #include "tensorflow/core/grappler/optimizers/graph_optimizer.h" #include "tensorflow/core/grappler/utils.h" #include "tensorflow/core/protobuf/rewriter_config.pb.h" @@ -49,10 +50,10 @@ class ScopedAllocatorOptimizer : public GraphOptimizer { const GraphDef& optimized_graph, double result) override {} // Map from an Op name to a vector of Nodes with that Op. - typedef std::unordered_map> DevOpOccurrences; + typedef absl::flat_hash_map> DevOpOccurrences; // Map from a device name to a DevOpOccurrences map. - typedef std::unordered_map GraphOpOccurrences; - typedef std::unordered_set OpNameSet; + typedef absl::flat_hash_map GraphOpOccurrences; + typedef absl::flat_hash_set OpNameSet; Status ProcessGraphDef(GraphDef* graph, const GraphProperties& graph_properties); @@ -72,6 +73,10 @@ class ScopedAllocatorOptimizer : public GraphOptimizer { NodeMap* node_map() { return node_map_.get(); } + const absl::flat_hash_set& repeated_outputs() { + return repeated_outputs_; + } + // Appends values to the attr value under name in node_def, if present. // If not present does an assignment. static void ExtendNodeAttr(StringPiece name, const std::vector& values, @@ -106,11 +111,15 @@ class ScopedAllocatorOptimizer : public GraphOptimizer { RewriterConfig::Toggle opt_level_; std::unordered_set nodes_to_preserve_; OpNameSet op_name_set_; - std::unordered_map rewriters_; + absl::flat_hash_map rewriters_; std::vector to_delete_; int next_sa_id_ = 1; int next_identity_id_ = 1; std::unique_ptr node_map_; + // Keeps track of outputs, i.e. a node and an output index, that are inputs to + // more than one op groups that are candidates for scoped allocator + // optimization. 
+ absl::flat_hash_set repeated_outputs_; }; } // namespace grappler From c29c55f99361439648176d5750d41a4146663f04 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sat, 20 Jul 2019 20:05:35 +0000 Subject: [PATCH 0240/3053] Pass name to tf.cast during the conversion, part of the review feedback Signed-off-by: Yong Tang --- tensorflow/python/ops/math_ops.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 2b6267fc635..833e2cf72ed 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -1356,9 +1356,9 @@ def range(start, limit=None, delta=1, dtype=None, name="range"): # pylint: disa # which is comparable with: # np.arange(np.int(5), dtype=np.float32) if dtype is not None: - start = cast(start, dtype=dtype) - limit = cast(limit, dtype=dtype) - delta = cast(delta, dtype=dtype) + start = cast(start, dtype=dtype, name="start") + limit = cast(limit, dtype=dtype, name="limit") + delta = cast(delta, dtype=dtype, name="delta") else: start = ops.convert_to_tensor(start, name="start") limit = ops.convert_to_tensor(limit, name="limit") From 3ddc727f16df1007daa183c7308b3bd440b7061a Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sat, 20 Jul 2019 20:16:18 +0000 Subject: [PATCH 0241/3053] Fix test failures Signed-off-by: Yong Tang --- tensorflow/python/eager/function.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py index c6b1f33068c..84a8ae49d47 100644 --- a/tensorflow/python/eager/function.py +++ b/tensorflow/python/eager/function.py @@ -1541,7 +1541,7 @@ def _convert_inputs_to_signature(inputs, input_signature, flat_input_signature): expand_composites=True) except ValueError: raise ValueError("Structure of Python function inputs does not match " - "input_signature:\n" % + "input_signature:\n%s" % format_error_message(inputs, input_signature)) need_packing = False @@ -1555,7 +1555,7 @@ def _convert_inputs_to_signature(inputs, input_signature, flat_input_signature): except ValueError: raise ValueError("When input_signature is provided, all inputs to " "the Python function must be convertible to " - "tensors:\n" % + "tensors:\n%s" % format_error_message(inputs, input_signature)) if any(not spec.is_compatible_with(other) for spec, other in zip( From eedf79ed3782dddd1c4787c72fc9804a20252245 Mon Sep 17 00:00:00 2001 From: Yu-Cheng Ling Date: Sat, 20 Jul 2019 15:45:31 -0700 Subject: [PATCH 0242/3053] Graduate TFLite control flow ops from experimental to builtin PiperOrigin-RevId: 259150573 --- .../mlir/lite/flatbuffer_translate.cc | 153 ++++----- .../lite/tests/mlir2flatbuffer/if_op.mlir | 18 +- .../lite/tests/mlir2flatbuffer/while_op.mlir | 15 +- tensorflow/lite/builtin_ops.h | 2 + tensorflow/lite/c/builtin_op_data.h | 10 + .../lite/core/api/flatbuffer_conversions.cc | 18 ++ .../writer/option_writer_generator.cc | 2 + tensorflow/lite/kernels/if.cc | 11 +- tensorflow/lite/kernels/register.cc | 8 +- tensorflow/lite/kernels/subgraph_test_util.cc | 32 +- tensorflow/lite/kernels/while.cc | 11 +- tensorflow/lite/schema/schema.fbs | 16 +- tensorflow/lite/schema/schema_generated.h | 304 +++++++++++++++++- 13 files changed, 471 insertions(+), 129 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc b/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc index c6a461d7414..ab17d62fa53 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc +++ 
b/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc @@ -368,9 +368,14 @@ class Translator { const std::string& name, unsigned buffer_idx); - CustomOptionsOffset CreateIfOpCustomOptions(mlir::TF::IfOp op); - - CustomOptionsOffset CreateWhileOpCustomOptions(mlir::TF::WhileOp op); + // TODO(b/137395003): Legalize control flow ops to TFLite dialect, and remove + // these 2 functions here. + BufferOffset BuildIfOperator( + mlir::TF::IfOp op, const std::vector& operands, + const std::vector& results); + BufferOffset BuildWhileOperator( + mlir::TF::WhileOp op, const std::vector& operands, + const std::vector& results); Optional CreateFlexOpCustomOptions( const ::tensorflow::NodeDef& node_def, const mlir::Location& loc); @@ -544,31 +549,36 @@ Optional> Translator::BuildTensor( builder_.CreateString(name), q_params, /*is_variable=*/false); } -CustomOptionsOffset Translator::CreateIfOpCustomOptions(mlir::TF::IfOp op) { +BufferOffset Translator::BuildIfOperator( + mlir::TF::IfOp op, const std::vector& operands, + const std::vector& results) { + auto opcode_index = GetOpcodeIndex("if", tflite::BuiltinOperator_IF); int then_subgraph_index = subgraph_index_map_.at(op.then_branch().str()); int else_subgraph_index = subgraph_index_map_.at(op.else_branch().str()); - - auto flex_builder = absl::make_unique(); - flex_builder->Map([&]() { - flex_builder->Int("then_subgraph_index", then_subgraph_index); - flex_builder->Int("else_subgraph_index", else_subgraph_index); - }); - flex_builder->Finish(); - return builder_.CreateVector(flex_builder->GetBuffer()); + auto builtin_options = tflite::CreateIfOptions(builder_, then_subgraph_index, + else_subgraph_index) + .Union(); + auto inputs = builder_.CreateVector(operands); + auto outputs = builder_.CreateVector(results); + return tflite::CreateOperator(builder_, opcode_index, inputs, outputs, + tflite::BuiltinOptions_IfOptions, + builtin_options); } -CustomOptionsOffset Translator::CreateWhileOpCustomOptions( - mlir::TF::WhileOp op) { +BufferOffset Translator::BuildWhileOperator( + mlir::TF::WhileOp op, const std::vector& operands, + const std::vector& results) { + auto opcode_index = GetOpcodeIndex("while", tflite::BuiltinOperator_WHILE); int cond_subgraph_index = subgraph_index_map_.at(op.cond().str()); int body_subgraph_index = subgraph_index_map_.at(op.body().str()); - - auto flex_builder = absl::make_unique(); - flex_builder->Map([&]() { - flex_builder->Int("cond_subgraph_index", cond_subgraph_index); - flex_builder->Int("body_subgraph_index", body_subgraph_index); - }); - flex_builder->Finish(); - return builder_.CreateVector(flex_builder->GetBuffer()); + auto builtin_options = tflite::CreateWhileOptions( + builder_, cond_subgraph_index, body_subgraph_index) + .Union(); + auto inputs = builder_.CreateVector(operands); + auto outputs = builder_.CreateVector(results); + return tflite::CreateOperator(builder_, opcode_index, inputs, outputs, + tflite::BuiltinOptions_WhileOptions, + builtin_options); } Optional Translator::CreateFlexOpCustomOptions( @@ -712,63 +722,60 @@ Optional> Translator::BuildOperator( if (dialect == tf_dialect_) { std::string op_name; + if (auto ifOp = dyn_cast(inst)) { + return BuildIfOperator(ifOp, operands, results); + } else if (auto whileOp = dyn_cast(inst)) { + return BuildWhileOperator(whileOp, operands, results); + } + CustomOptionsOffset custom_options; - if (auto ifOp = dyn_cast(inst)) { - op_name = "Experimental_If"; - custom_options = CreateIfOpCustomOptions(ifOp); - } else if (auto whileOp = dyn_cast(inst)) { - op_name = 
"Experimental_While"; - custom_options = CreateWhileOpCustomOptions(whileOp); - } else { - // Ops in TF dialect can either be custom ops or flex ops. - // The reason we go directly from TensorFlow dialect MLIR to tensorflow - // node instead of going to TF table gen'd ops via generated code is that - // we do not want to restrict custom and flex op conversion support to - // only those TF ops that are currently registered in MLIR. The current - // model is of an open op system. - // - // The following algorithm is followed: - // if flex is enabled and the op is whitelisted as flex - // we emit op as flex. - // if custom is enabled - // we emit the op as custom. - auto node_def = getTensorFlowNodeDef(inst); - if (!node_def) { + // Ops in TF dialect can either be custom ops or flex ops. + // The reason we go directly from TensorFlow dialect MLIR to tensorflow + // node instead of going to TF table gen'd ops via generated code is that + // we do not want to restrict custom and flex op conversion support to + // only those TF ops that are currently registered in MLIR. The current + // model is of an open op system. + // + // The following algorithm is followed: + // if flex is enabled and the op is whitelisted as flex + // we emit op as flex. + // if custom is enabled + // we emit the op as custom. + auto node_def = getTensorFlowNodeDef(inst); + if (!node_def) { + return llvm::None; + } + + // Flex op case + // Eventually, the whitelist will go away and we will rely on some TF op + // trait (e.g. No side effect) to determine if it is a supported "Flex" + // op or not. + if (enabled_op_types_.contains(OpType::kSelectTf) && + IsWhitelistedFlexOp(node_def->op())) { + // Construct ops as flex op encoding TensorFlow node definition + // as custom options. + // Flex ops are named with the kFlexOpNamePrefix prefix to the actual + // TF op name. + op_name = std::string(kFlexOpNamePrefix) + node_def->op(); + if (auto options = CreateFlexOpCustomOptions(*node_def, inst->getLoc())) { + custom_options = *options; + } else { return llvm::None; } - - // Flex op case - // Eventually, the whitelist will go away and we will rely on some TF op - // trait (e.g. No side effect) to determine if it is a supported "Flex" - // op or not. - if (enabled_op_types_.contains(OpType::kSelectTf) && - IsWhitelistedFlexOp(node_def->op())) { - // Construct ops as flex op encoding TensorFlow node definition - // as custom options. - // Flex ops are named with the kFlexOpNamePrefix prefix to the actual - // TF op name. - op_name = std::string(kFlexOpNamePrefix) + node_def->op(); - if (auto options = - CreateFlexOpCustomOptions(*node_def, inst->getLoc())) { - custom_options = *options; - } else { - return llvm::None; - } - } else if (enabled_op_types_.contains(OpType::kCustomOp)) { - // Generic case of custom ops - write using flex buffers since that - // is the only custom options supported by TFLite today. - op_name = node_def->op(); - if (auto options = - CreateCustomOpCustomOptions(*node_def, inst->getLoc())) { - custom_options = *options; - } else { - return llvm::None; - } + } else if (enabled_op_types_.contains(OpType::kCustomOp)) { + // Generic case of custom ops - write using flex buffers since that + // is the only custom options supported by TFLite today. 
+ op_name = node_def->op(); + if (auto options = + CreateCustomOpCustomOptions(*node_def, inst->getLoc())) { + custom_options = *options; } else { - return inst->emitOpError("is neither a custom op nor a flex op"), - llvm::None; + return llvm::None; } + } else { + return inst->emitOpError("is neither a custom op nor a flex op"), + llvm::None; } uint32_t opcode_index = diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/if_op.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/if_op.mlir index 7702045547e..03048bd640d 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/if_op.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/if_op.mlir @@ -1,12 +1,12 @@ -// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s --dump-input-on-failure + // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { // CHECK-NEXT: builtin_code: LESS // CHECK-NEXT: }, { -// CHECK-NEXT: builtin_code: CUSTOM, -// CHECK-NEXT: custom_code: "Experimental_If" +// CHECK-NEXT: builtin_code: IF // CHECK-NEXT: }, { // CHECK-EMPTY: // CHECK-NEXT: }, { @@ -52,8 +52,12 @@ // CHECK-NEXT: opcode_index: 1, // CHECK-NEXT: inputs: [ 2, 0, 1 ], // CHECK-NEXT: outputs: [ 3 ], -// CHECK-NEXT: custom_options: [ 116, 104, 101, 110, 95, 115, 117, 98, 103, 114, 97, 112, 104, 95, 105, 110, 100, 101, 120, 0, 101, 108, 115, 101, 95, 115, 117, 98, 103, 114, 97, 112, 104, 95, 105, 110, 100, 101, 120, 0, 2, 21, 42, 2, 1, 2, 2, 1, 4, 4, 4, 36, 1 ] -// CHECK-NEXT: } ] +// CHECK-NEXT: builtin_options_type: IfOptions, +// CHECK-NEXT: builtin_options: { +// CHECK-NEXT: then_subgraph_index: 1, +// CHECK-NEXT: else_subgraph_index: 2 +// CHECK-NEXT: } +// CHECK-NEXT: } ], // CHECK-NEXT: name: "main" // CHECK-NEXT: }, { // CHECK-NEXT: tensors: [ { @@ -88,7 +92,7 @@ // CHECK-NEXT: builtin_options: { // CHECK-EMPTY: // CHECK-NEXT: } -// CHECK-NEXT: } ] +// CHECK-NEXT: } ], // CHECK-NEXT: name: "cond_true" // CHECK-NEXT: }, { // CHECK-NEXT: tensors: [ { @@ -123,7 +127,7 @@ // CHECK-NEXT: builtin_options: { // CHECK-EMPTY: // CHECK-NEXT: } -// CHECK-NEXT: } ] +// CHECK-NEXT: } ], // CHECK-NEXT: name: "cond_false" // CHECK-NEXT: } ], // CHECK-NEXT: description: "MLIR Converted.", diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/while_op.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/while_op.mlir index fd403aa72c5..117f97455cc 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/while_op.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/while_op.mlir @@ -3,8 +3,7 @@ // CHECK: { // CHECK-NEXT: version: 3, // CHECK-NEXT: operator_codes: [ { -// CHECK-NEXT: builtin_code: CUSTOM, -// CHECK-NEXT: custom_code: "Experimental_While" +// CHECK-NEXT: builtin_code: WHILE // CHECK-NEXT: }, { // CHECK-NEXT: builtin_code: GREATER // CHECK-NEXT: }, { @@ -49,8 +48,12 @@ // CHECK-NEXT: operators: [ { // CHECK-NEXT: inputs: [ 0, 1 ], // CHECK-NEXT: outputs: [ 2, 3 ], -// CHECK-NEXT: custom_options: [ 99, 111, 110, 100, 95, 115, 117, 98, 103, 114, 97, 112, 104, 95, 105, 110, 100, 101, 120, 0, 98, 111, 100, 121, 95, 115, 117, 98, 103, 114, 97, 112, 104, 95, 105, 110, 100, 101, 120, 0, 2, 21, 42, 2, 1, 2, 2, 1, 4, 4, 4, 36, 1 ] -// CHECK-NEXT: } ] +// CHECK-NEXT: builtin_options_type: WhileOptions, +// CHECK-NEXT: builtin_options: { +// CHECK-NEXT: cond_subgraph_index: 1, +// CHECK-NEXT: body_subgraph_index: 2 +// 
CHECK-NEXT: } +// CHECK-NEXT: } ], // CHECK-NEXT: name: "main" // CHECK-NEXT: }, { // CHECK-NEXT: tensors: [ { @@ -91,7 +94,7 @@ // CHECK-NEXT: opcode_index: 1, // CHECK-NEXT: inputs: [ 0, 2 ], // CHECK-NEXT: outputs: [ 3 ] -// CHECK-NEXT: } ] +// CHECK-NEXT: } ], // CHECK-NEXT: name: "cond" // CHECK-NEXT: }, { // CHECK-NEXT: tensors: [ { @@ -151,7 +154,7 @@ // CHECK-NEXT: builtin_options: { // CHECK-EMPTY: // CHECK-NEXT: } -// CHECK-NEXT: } ] +// CHECK-NEXT: } ], // CHECK-NEXT: name: "body" // CHECK-NEXT: } ], // CHECK-NEXT: description: "MLIR Converted.", diff --git a/tensorflow/lite/builtin_ops.h b/tensorflow/lite/builtin_ops.h index 1ed7022fc02..785853f2db1 100644 --- a/tensorflow/lite/builtin_ops.h +++ b/tensorflow/lite/builtin_ops.h @@ -143,6 +143,8 @@ typedef enum { kTfLiteBuiltinMatrixSetDiag = 115, kTfLiteBuiltinRound = 116, kTfLiteBuiltinHardSwish = 117, + kTfLiteBuiltinIf = 118, + kTfLiteBuiltinWhile = 119, } TfLiteBuiltinOperator; #ifdef __cplusplus diff --git a/tensorflow/lite/c/builtin_op_data.h b/tensorflow/lite/c/builtin_op_data.h index 283d15de67b..00ed17d5a04 100644 --- a/tensorflow/lite/c/builtin_op_data.h +++ b/tensorflow/lite/c/builtin_op_data.h @@ -391,6 +391,16 @@ typedef struct { EmptyStructPlaceholder placeholder; } TfLiteMatrixSetDiagParams; +typedef struct { + int then_subgraph_index; + int else_subgraph_index; +} TfLiteIfParams; + +typedef struct { + int cond_subgraph_index; + int body_subgraph_index; +} TfLiteWhileParams; + #ifdef __cplusplus } // extern "C" #endif // __cplusplus diff --git a/tensorflow/lite/core/api/flatbuffer_conversions.cc b/tensorflow/lite/core/api/flatbuffer_conversions.cc index a0f97da58ce..53a4e8fcc5a 100644 --- a/tensorflow/lite/core/api/flatbuffer_conversions.cc +++ b/tensorflow/lite/core/api/flatbuffer_conversions.cc @@ -721,6 +721,24 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type, *builtin_data = reinterpret_cast(params.release()); break; } + case BuiltinOperator_IF: { + TfLiteIfParams* params = allocator->AllocatePOD(); + if (const auto* if_params = op->builtin_options_as_IfOptions()) { + params->then_subgraph_index = if_params->then_subgraph_index(); + params->else_subgraph_index = if_params->else_subgraph_index(); + } + *builtin_data = reinterpret_cast(params); + break; + } + case BuiltinOperator_WHILE: { + TfLiteWhileParams* params = allocator->AllocatePOD(); + if (const auto* while_params = op->builtin_options_as_WhileOptions()) { + params->cond_subgraph_index = while_params->cond_subgraph_index(); + params->body_subgraph_index = while_params->body_subgraph_index(); + } + *builtin_data = reinterpret_cast(params); + break; + } // Below are the ops with no builtin_data structure. 
case BuiltinOperator_ABS: case BuiltinOperator_BATCH_TO_SPACE_ND: diff --git a/tensorflow/lite/experimental/writer/option_writer_generator.cc b/tensorflow/lite/experimental/writer/option_writer_generator.cc index 2ea105f4127..cdb1372b929 100644 --- a/tensorflow/lite/experimental/writer/option_writer_generator.cc +++ b/tensorflow/lite/experimental/writer/option_writer_generator.cc @@ -40,6 +40,7 @@ static const char* param_structs[] = {"TfLiteAddParams", "TfLiteFakeQuantParams", "TfLiteFullyConnectedParams", "TfLiteGatherParams", + "TfLiteIfParams", "TfLiteL2NormParams", "TfLiteLeakyReluParams", "TfLiteLocalResponseNormParams", @@ -76,6 +77,7 @@ static const char* param_structs[] = {"TfLiteAddParams", "TfLiteUniqueParams", "TfLiteUnpackParams", "TfLiteReverseSequenceParams", + "TfLiteWhileParams", nullptr}; } // namespace diff --git a/tensorflow/lite/kernels/if.cc b/tensorflow/lite/kernels/if.cc index 1bd394e9800..610af8cd4b9 100644 --- a/tensorflow/lite/kernels/if.cc +++ b/tensorflow/lite/kernels/if.cc @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "flatbuffers/flexbuffers.h" // TF:flatbuffers + +#include + #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/c_api_internal.h" #include "tensorflow/lite/core/subgraph.h" @@ -30,10 +32,9 @@ struct OpData { void* Init(TfLiteContext* context, const char* buffer, size_t length) { auto* op_data = new OpData; - const uint8_t* buffer_t = reinterpret_cast(buffer); - const flexbuffers::Map& m = flexbuffers::GetRoot(buffer_t, length).AsMap(); - op_data->then_subgraph_index = m["then_subgraph_index"].AsInt32(); - op_data->else_subgraph_index = m["else_subgraph_index"].AsInt32(); + const auto* params = reinterpret_cast(buffer); + op_data->then_subgraph_index = params->then_subgraph_index; + op_data->else_subgraph_index = params->else_subgraph_index; return op_data; } diff --git a/tensorflow/lite/kernels/register.cc b/tensorflow/lite/kernels/register.cc index bd2643aaa64..6832ac73f6d 100644 --- a/tensorflow/lite/kernels/register.cc +++ b/tensorflow/lite/kernels/register.cc @@ -381,6 +381,10 @@ BuiltinOpResolver::BuiltinOpResolver() { /* max_version */ 2); AddBuiltin(BuiltinOperator_MATRIX_SET_DIAG, Register_MATRIX_SET_DIAG()); + // WARNING: Control flow ops are experimental and subject to change. + AddBuiltin(BuiltinOperator_IF, tflite::ops::custom::Register_IF()); + AddBuiltin(BuiltinOperator_WHILE, tflite::ops::custom::Register_WHILE()); + // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that // custom ops aren't always included by default. AddCustom("Mfcc", tflite::ops::custom::Register_MFCC()); @@ -388,10 +392,6 @@ BuiltinOpResolver::BuiltinOpResolver() { tflite::ops::custom::Register_AUDIO_SPECTROGRAM()); AddCustom("TFLite_Detection_PostProcess", tflite::ops::custom::Register_DETECTION_POSTPROCESS()); - - // WARNING: Control flow ops are experimental and subject to change. 
- AddCustom("Experimental_If", tflite::ops::custom::Register_IF()); - AddCustom("Experimental_While", tflite::ops::custom::Register_WHILE()); } } // namespace builtin diff --git a/tensorflow/lite/kernels/subgraph_test_util.cc b/tensorflow/lite/kernels/subgraph_test_util.cc index e55965ecf94..b60bdab080d 100644 --- a/tensorflow/lite/kernels/subgraph_test_util.cc +++ b/tensorflow/lite/kernels/subgraph_test_util.cc @@ -170,18 +170,14 @@ void SubgraphBuilder::BuildIfSubgraph(Subgraph* subgraph) { SetupTensor(subgraph, kInput2, kTfLiteInt32); SetupTensor(subgraph, kOutput, kTfLiteInt32); - flexbuffers::Builder fbb; - fbb.Map([&]() { - fbb.Int("then_subgraph_index", 1); - fbb.Int("else_subgraph_index", 2); - }); - fbb.Finish(); - const auto& buffer = fbb.GetBuffer(); + TfLiteIfParams* params = + reinterpret_cast(malloc(sizeof(TfLiteIfParams))); + params->then_subgraph_index = 1; + params->else_subgraph_index = 2; int node_index; subgraph->AddNodeWithParameters( - {kCondInput, kInput1, kInput2}, {kOutput}, {}, - reinterpret_cast(buffer.data()), buffer.size(), nullptr, + {kCondInput, kInput1, kInput2}, {kOutput}, {}, nullptr, 0, params, ::tflite::ops::custom::Register_IF(), &node_index); } @@ -333,19 +329,15 @@ void SubgraphBuilder::BuildWhileSubgraph(Subgraph* subgraph) { SetupTensor(subgraph, kOutput1, kTfLiteInt32); SetupTensor(subgraph, kOutput2, kTfLiteInt32); - flexbuffers::Builder fbb; - fbb.Map([&]() { - fbb.Int("cond_subgraph_index", 1); - fbb.Int("body_subgraph_index", 2); - }); - fbb.Finish(); - const auto& buffer = fbb.GetBuffer(); + TfLiteWhileParams* params = + reinterpret_cast(malloc(sizeof(TfLiteWhileParams))); + params->cond_subgraph_index = 1; + params->body_subgraph_index = 2; int node_index; - subgraph->AddNodeWithParameters( - {0, 1}, {2, 3}, {}, reinterpret_cast(buffer.data()), - buffer.size(), nullptr, ::tflite::ops::custom::Register_WHILE(), - &node_index); + subgraph->AddNodeWithParameters({0, 1}, {2, 3}, {}, nullptr, 0, params, + ::tflite::ops::custom::Register_WHILE(), + &node_index); } void SubgraphBuilder::CreateConstantInt32Tensor(Subgraph* subgraph, diff --git a/tensorflow/lite/kernels/while.cc b/tensorflow/lite/kernels/while.cc index a6438558458..6ac1d4b1e91 100644 --- a/tensorflow/lite/kernels/while.cc +++ b/tensorflow/lite/kernels/while.cc @@ -12,7 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "flatbuffers/flexbuffers.h" // TF:flatbuffers + +#include + #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/c_api_internal.h" #include "tensorflow/lite/context_util.h" @@ -107,10 +109,9 @@ struct OpData { void* Init(TfLiteContext* context, const char* buffer, size_t length) { auto* op_data = new OpData; - const uint8_t* buffer_t = reinterpret_cast(buffer); - const flexbuffers::Map& m = flexbuffers::GetRoot(buffer_t, length).AsMap(); - op_data->cond_subgraph_index = m["cond_subgraph_index"].AsInt32(); - op_data->body_subgraph_index = m["body_subgraph_index"].AsInt32(); + const auto* params = reinterpret_cast(buffer); + op_data->cond_subgraph_index = params->cond_subgraph_index; + op_data->body_subgraph_index = params->body_subgraph_index; op_data->cond_has_dynamic_output_tensors = false; op_data->body_has_dynamic_output_tensors = false; return op_data; diff --git a/tensorflow/lite/schema/schema.fbs b/tensorflow/lite/schema/schema.fbs index 65c7156f0d3..b82bbdfd103 100644 --- a/tensorflow/lite/schema/schema.fbs +++ b/tensorflow/lite/schema/schema.fbs @@ -231,6 +231,8 @@ enum BuiltinOperator : byte { MATRIX_SET_DIAG = 115, ROUND = 116, HARD_SWISH = 117, + IF = 118, + WHILE = 119, } // Options for the builtin operators. @@ -325,7 +327,9 @@ union BuiltinOptions { MatrixDiagOptions, QuantizeOptions, MatrixSetDiagOptions, - HardSwishOptions + HardSwishOptions, + IfOptions, + WhileOptions } enum Padding : byte { SAME, VALID } @@ -783,6 +787,16 @@ table QuantizeOptions { table MatrixSetDiagOptions { } +table IfOptions { + then_subgraph_index:int; + else_subgraph_index:int; +} + +table WhileOptions { + cond_subgraph_index:int; + body_subgraph_index:int; +} + // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a // builtin, or a string if the operator is custom. 
table OperatorCode { diff --git a/tensorflow/lite/schema/schema_generated.h b/tensorflow/lite/schema/schema_generated.h index abe1f3f9a4a..07d554444b0 100755 --- a/tensorflow/lite/schema/schema_generated.h +++ b/tensorflow/lite/schema/schema_generated.h @@ -304,6 +304,12 @@ struct QuantizeOptionsT; struct MatrixSetDiagOptions; struct MatrixSetDiagOptionsT; +struct IfOptions; +struct IfOptionsT; + +struct WhileOptions; +struct WhileOptionsT; + struct OperatorCode; struct OperatorCodeT; @@ -577,11 +583,13 @@ enum BuiltinOperator { BuiltinOperator_MATRIX_SET_DIAG = 115, BuiltinOperator_ROUND = 116, BuiltinOperator_HARD_SWISH = 117, + BuiltinOperator_IF = 118, + BuiltinOperator_WHILE = 119, BuiltinOperator_MIN = BuiltinOperator_ADD, - BuiltinOperator_MAX = BuiltinOperator_HARD_SWISH + BuiltinOperator_MAX = BuiltinOperator_WHILE }; -inline const BuiltinOperator (&EnumValuesBuiltinOperator())[117] { +inline const BuiltinOperator (&EnumValuesBuiltinOperator())[119] { static const BuiltinOperator values[] = { BuiltinOperator_ADD, BuiltinOperator_AVERAGE_POOL_2D, @@ -699,7 +707,9 @@ inline const BuiltinOperator (&EnumValuesBuiltinOperator())[117] { BuiltinOperator_QUANTIZE, BuiltinOperator_MATRIX_SET_DIAG, BuiltinOperator_ROUND, - BuiltinOperator_HARD_SWISH + BuiltinOperator_HARD_SWISH, + BuiltinOperator_IF, + BuiltinOperator_WHILE }; return values; } @@ -824,13 +834,15 @@ inline const char * const *EnumNamesBuiltinOperator() { "MATRIX_SET_DIAG", "ROUND", "HARD_SWISH", + "IF", + "WHILE", nullptr }; return names; } inline const char *EnumNameBuiltinOperator(BuiltinOperator e) { - if (e < BuiltinOperator_ADD || e > BuiltinOperator_HARD_SWISH) return ""; + if (e < BuiltinOperator_ADD || e > BuiltinOperator_WHILE) return ""; const size_t index = static_cast(e); return EnumNamesBuiltinOperator()[index]; } @@ -928,11 +940,13 @@ enum BuiltinOptions { BuiltinOptions_QuantizeOptions = 89, BuiltinOptions_MatrixSetDiagOptions = 90, BuiltinOptions_HardSwishOptions = 91, + BuiltinOptions_IfOptions = 92, + BuiltinOptions_WhileOptions = 93, BuiltinOptions_MIN = BuiltinOptions_NONE, - BuiltinOptions_MAX = BuiltinOptions_HardSwishOptions + BuiltinOptions_MAX = BuiltinOptions_WhileOptions }; -inline const BuiltinOptions (&EnumValuesBuiltinOptions())[92] { +inline const BuiltinOptions (&EnumValuesBuiltinOptions())[94] { static const BuiltinOptions values[] = { BuiltinOptions_NONE, BuiltinOptions_Conv2DOptions, @@ -1025,7 +1039,9 @@ inline const BuiltinOptions (&EnumValuesBuiltinOptions())[92] { BuiltinOptions_MatrixDiagOptions, BuiltinOptions_QuantizeOptions, BuiltinOptions_MatrixSetDiagOptions, - BuiltinOptions_HardSwishOptions + BuiltinOptions_HardSwishOptions, + BuiltinOptions_IfOptions, + BuiltinOptions_WhileOptions }; return values; } @@ -1124,13 +1140,15 @@ inline const char * const *EnumNamesBuiltinOptions() { "QuantizeOptions", "MatrixSetDiagOptions", "HardSwishOptions", + "IfOptions", + "WhileOptions", nullptr }; return names; } inline const char *EnumNameBuiltinOptions(BuiltinOptions e) { - if (e < BuiltinOptions_NONE || e > BuiltinOptions_HardSwishOptions) return ""; + if (e < BuiltinOptions_NONE || e > BuiltinOptions_WhileOptions) return ""; const size_t index = static_cast(e); return EnumNamesBuiltinOptions()[index]; } @@ -1503,6 +1521,14 @@ template<> struct BuiltinOptionsTraits { static const BuiltinOptions enum_value = BuiltinOptions_HardSwishOptions; }; +template<> struct BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_IfOptions; +}; + +template<> struct 
BuiltinOptionsTraits { + static const BuiltinOptions enum_value = BuiltinOptions_WhileOptions; +}; + struct BuiltinOptionsUnion { BuiltinOptions type; void *value; @@ -2263,6 +2289,22 @@ struct BuiltinOptionsUnion { return type == BuiltinOptions_HardSwishOptions ? reinterpret_cast(value) : nullptr; } + IfOptionsT *AsIfOptions() { + return type == BuiltinOptions_IfOptions ? + reinterpret_cast(value) : nullptr; + } + const IfOptionsT *AsIfOptions() const { + return type == BuiltinOptions_IfOptions ? + reinterpret_cast(value) : nullptr; + } + WhileOptionsT *AsWhileOptions() { + return type == BuiltinOptions_WhileOptions ? + reinterpret_cast(value) : nullptr; + } + const WhileOptionsT *AsWhileOptions() const { + return type == BuiltinOptions_WhileOptions ? + reinterpret_cast(value) : nullptr; + } }; bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type); @@ -7856,6 +7898,138 @@ inline flatbuffers::Offset CreateMatrixSetDiagOptions( flatbuffers::Offset CreateMatrixSetDiagOptions(flatbuffers::FlatBufferBuilder &_fbb, const MatrixSetDiagOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +struct IfOptionsT : public flatbuffers::NativeTable { + typedef IfOptions TableType; + int32_t then_subgraph_index; + int32_t else_subgraph_index; + IfOptionsT() + : then_subgraph_index(0), + else_subgraph_index(0) { + } +}; + +struct IfOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef IfOptionsT NativeTableType; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_THEN_SUBGRAPH_INDEX = 4, + VT_ELSE_SUBGRAPH_INDEX = 6 + }; + int32_t then_subgraph_index() const { + return GetField(VT_THEN_SUBGRAPH_INDEX, 0); + } + int32_t else_subgraph_index() const { + return GetField(VT_ELSE_SUBGRAPH_INDEX, 0); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_THEN_SUBGRAPH_INDEX) && + VerifyField(verifier, VT_ELSE_SUBGRAPH_INDEX) && + verifier.EndTable(); + } + IfOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(IfOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const; + static flatbuffers::Offset Pack(flatbuffers::FlatBufferBuilder &_fbb, const IfOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct IfOptionsBuilder { + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_then_subgraph_index(int32_t then_subgraph_index) { + fbb_.AddElement(IfOptions::VT_THEN_SUBGRAPH_INDEX, then_subgraph_index, 0); + } + void add_else_subgraph_index(int32_t else_subgraph_index) { + fbb_.AddElement(IfOptions::VT_ELSE_SUBGRAPH_INDEX, else_subgraph_index, 0); + } + explicit IfOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + IfOptionsBuilder &operator=(const IfOptionsBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateIfOptions( + flatbuffers::FlatBufferBuilder &_fbb, + int32_t then_subgraph_index = 0, + int32_t else_subgraph_index = 0) { + IfOptionsBuilder builder_(_fbb); + builder_.add_else_subgraph_index(else_subgraph_index); + builder_.add_then_subgraph_index(then_subgraph_index); + return builder_.Finish(); +} + +flatbuffers::Offset CreateIfOptions(flatbuffers::FlatBufferBuilder &_fbb, const IfOptionsT *_o, const 
flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct WhileOptionsT : public flatbuffers::NativeTable { + typedef WhileOptions TableType; + int32_t cond_subgraph_index; + int32_t body_subgraph_index; + WhileOptionsT() + : cond_subgraph_index(0), + body_subgraph_index(0) { + } +}; + +struct WhileOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef WhileOptionsT NativeTableType; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_COND_SUBGRAPH_INDEX = 4, + VT_BODY_SUBGRAPH_INDEX = 6 + }; + int32_t cond_subgraph_index() const { + return GetField(VT_COND_SUBGRAPH_INDEX, 0); + } + int32_t body_subgraph_index() const { + return GetField(VT_BODY_SUBGRAPH_INDEX, 0); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_COND_SUBGRAPH_INDEX) && + VerifyField(verifier, VT_BODY_SUBGRAPH_INDEX) && + verifier.EndTable(); + } + WhileOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(WhileOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const; + static flatbuffers::Offset Pack(flatbuffers::FlatBufferBuilder &_fbb, const WhileOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct WhileOptionsBuilder { + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_cond_subgraph_index(int32_t cond_subgraph_index) { + fbb_.AddElement(WhileOptions::VT_COND_SUBGRAPH_INDEX, cond_subgraph_index, 0); + } + void add_body_subgraph_index(int32_t body_subgraph_index) { + fbb_.AddElement(WhileOptions::VT_BODY_SUBGRAPH_INDEX, body_subgraph_index, 0); + } + explicit WhileOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + WhileOptionsBuilder &operator=(const WhileOptionsBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateWhileOptions( + flatbuffers::FlatBufferBuilder &_fbb, + int32_t cond_subgraph_index = 0, + int32_t body_subgraph_index = 0) { + WhileOptionsBuilder builder_(_fbb); + builder_.add_body_subgraph_index(body_subgraph_index); + builder_.add_cond_subgraph_index(cond_subgraph_index); + return builder_.Finish(); +} + +flatbuffers::Offset CreateWhileOptions(flatbuffers::FlatBufferBuilder &_fbb, const WhileOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); + struct OperatorCodeT : public flatbuffers::NativeTable { typedef OperatorCode TableType; BuiltinOperator builtin_code; @@ -8265,6 +8439,12 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { const HardSwishOptions *builtin_options_as_HardSwishOptions() const { return builtin_options_type() == BuiltinOptions_HardSwishOptions ? static_cast(builtin_options()) : nullptr; } + const IfOptions *builtin_options_as_IfOptions() const { + return builtin_options_type() == BuiltinOptions_IfOptions ? static_cast(builtin_options()) : nullptr; + } + const WhileOptions *builtin_options_as_WhileOptions() const { + return builtin_options_type() == BuiltinOptions_WhileOptions ? 
static_cast(builtin_options()) : nullptr; + } const flatbuffers::Vector *custom_options() const { return GetPointer *>(VT_CUSTOM_OPTIONS); } @@ -8665,6 +8845,14 @@ template<> inline const HardSwishOptions *Operator::builtin_options_as inline const IfOptions *Operator::builtin_options_as() const { + return builtin_options_as_IfOptions(); +} + +template<> inline const WhileOptions *Operator::builtin_options_as() const { + return builtin_options_as_WhileOptions(); +} + struct OperatorBuilder { flatbuffers::FlatBufferBuilder &fbb_; flatbuffers::uoffset_t start_; @@ -11690,6 +11878,64 @@ inline flatbuffers::Offset CreateMatrixSetDiagOptions(flat _fbb); } +inline IfOptionsT *IfOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const { + auto _o = new IfOptionsT(); + UnPackTo(_o, _resolver); + return _o; +} + +inline void IfOptions::UnPackTo(IfOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = then_subgraph_index(); _o->then_subgraph_index = _e; }; + { auto _e = else_subgraph_index(); _o->else_subgraph_index = _e; }; +} + +inline flatbuffers::Offset IfOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const IfOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) { + return CreateIfOptions(_fbb, _o, _rehasher); +} + +inline flatbuffers::Offset CreateIfOptions(flatbuffers::FlatBufferBuilder &_fbb, const IfOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const IfOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _then_subgraph_index = _o->then_subgraph_index; + auto _else_subgraph_index = _o->else_subgraph_index; + return tflite::CreateIfOptions( + _fbb, + _then_subgraph_index, + _else_subgraph_index); +} + +inline WhileOptionsT *WhileOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const { + auto _o = new WhileOptionsT(); + UnPackTo(_o, _resolver); + return _o; +} + +inline void WhileOptions::UnPackTo(WhileOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = cond_subgraph_index(); _o->cond_subgraph_index = _e; }; + { auto _e = body_subgraph_index(); _o->body_subgraph_index = _e; }; +} + +inline flatbuffers::Offset WhileOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const WhileOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) { + return CreateWhileOptions(_fbb, _o, _rehasher); +} + +inline flatbuffers::Offset CreateWhileOptions(flatbuffers::FlatBufferBuilder &_fbb, const WhileOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const WhileOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _cond_subgraph_index = _o->cond_subgraph_index; + auto _body_subgraph_index = _o->body_subgraph_index; + return tflite::CreateWhileOptions( + _fbb, + _cond_subgraph_index, + _body_subgraph_index); +} + inline OperatorCodeT *OperatorCode::UnPack(const flatbuffers::resolver_function_t *_resolver) const { auto _o = new OperatorCodeT(); UnPackTo(_o, _resolver); @@ -12347,6 +12593,14 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob auto ptr = reinterpret_cast(obj); return verifier.VerifyTable(ptr); } + case BuiltinOptions_IfOptions: { + auto ptr 
= reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } + case BuiltinOptions_WhileOptions: { + auto ptr = reinterpret_cast(obj); + return verifier.VerifyTable(ptr); + } default: return false; } } @@ -12729,6 +12983,14 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c auto ptr = reinterpret_cast(obj); return ptr->UnPack(resolver); } + case BuiltinOptions_IfOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } + case BuiltinOptions_WhileOptions: { + auto ptr = reinterpret_cast(obj); + return ptr->UnPack(resolver); + } default: return nullptr; } } @@ -13099,6 +13361,14 @@ inline flatbuffers::Offset BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff auto ptr = reinterpret_cast(value); return CreateHardSwishOptions(_fbb, ptr, _rehasher).Union(); } + case BuiltinOptions_IfOptions: { + auto ptr = reinterpret_cast(value); + return CreateIfOptions(_fbb, ptr, _rehasher).Union(); + } + case BuiltinOptions_WhileOptions: { + auto ptr = reinterpret_cast(value); + return CreateWhileOptions(_fbb, ptr, _rehasher).Union(); + } default: return 0; } } @@ -13469,6 +13739,14 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL value = new HardSwishOptionsT(*reinterpret_cast(u.value)); break; } + case BuiltinOptions_IfOptions: { + value = new IfOptionsT(*reinterpret_cast(u.value)); + break; + } + case BuiltinOptions_WhileOptions: { + value = new WhileOptionsT(*reinterpret_cast(u.value)); + break; + } default: break; } @@ -13931,6 +14209,16 @@ inline void BuiltinOptionsUnion::Reset() { delete ptr; break; } + case BuiltinOptions_IfOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } + case BuiltinOptions_WhileOptions: { + auto ptr = reinterpret_cast(value); + delete ptr; + break; + } default: break; } value = nullptr; From 32c1a612efe2a23df09b3fb8d01dabeaa635be0d Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Sat, 20 Jul 2019 15:50:51 -0700 Subject: [PATCH 0243/3053] Fix typo in comment. PiperOrigin-RevId: 259150770 --- tensorflow/compiler/xla/primitive_util.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/primitive_util.h b/tensorflow/compiler/xla/primitive_util.h index 295d3530032..034c14e8930 100644 --- a/tensorflow/compiler/xla/primitive_util.h +++ b/tensorflow/compiler/xla/primitive_util.h @@ -45,7 +45,7 @@ const int kBFloat16MantissaBits = 7; template PrimitiveType NativeToPrimitiveType() { // Make the expression depend on the template parameter NativeT so - // that this compile-time error only apperas if this function is + // that this compile-time error only appears if this function is // instantiated with some concrete type that is not specialized // below. static_assert(!std::is_same::value, From 20562226f041a76433c10875bd0924a6267b2196 Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Sat, 20 Jul 2019 16:11:48 -0700 Subject: [PATCH 0244/3053] Add missing namespace specification for string. 
PiperOrigin-RevId: 259152069 --- tensorflow/compiler/mlir/xla/operator_writer_gen.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/operator_writer_gen.cc b/tensorflow/compiler/mlir/xla/operator_writer_gen.cc index 0fb315b90f9..67c807ee4c4 100644 --- a/tensorflow/compiler/mlir/xla/operator_writer_gen.cc +++ b/tensorflow/compiler/mlir/xla/operator_writer_gen.cc @@ -51,8 +51,8 @@ static std::string GetConversionFunction( return "Convert_" + named_attr.name.str(); } -using ArgumentName = string; -using ArgumentDeclaration = string; +using ArgumentName = std::string; +using ArgumentDeclaration = std::string; using Argument = std::pair; using ArgumentList = std::vector; From b0bc0ead62d14d22210f3d6241e3444a11565486 Mon Sep 17 00:00:00 2001 From: Daniel Situnayake Date: Sat, 20 Jul 2019 19:30:23 -0700 Subject: [PATCH 0245/3053] TensorFlow Lite for Microcontrollers docs update PiperOrigin-RevId: 259163029 --- .../micro/examples/hello_world/README.md | 12 +- .../g3doc/microcontrollers/build_convert.md | 5 + .../g3doc/microcontrollers/get_started.md | 140 +++++++++--------- 3 files changed, 83 insertions(+), 74 deletions(-) diff --git a/tensorflow/lite/experimental/micro/examples/hello_world/README.md b/tensorflow/lite/experimental/micro/examples/hello_world/README.md index 1de9730848c..e0b593fb4d3 100644 --- a/tensorflow/lite/experimental/micro/examples/hello_world/README.md +++ b/tensorflow/lite/experimental/micro/examples/hello_world/README.md @@ -32,11 +32,17 @@ Microcontrollers. ### Build the code -To compile and test this example on a desktop Linux or MacOS machine, download -[the TensorFlow source code](https://github.com/tensorflow/tensorflow), `cd` -into the source directory from a terminal, and then run the following command: +To compile and test this example on a desktop Linux or macOS machine, first +clone the TensorFlow repository from GitHub to a convenient place: +```bash +git clone --depth 1 https://github.com/tensorflow/tensorflow.git ``` + +Next, `cd` into the source directory from a terminal, and then run the following +command: + +```bash make -f tensorflow/lite/experimental/micro/tools/make/Makefile test_hello_world_test ``` diff --git a/tensorflow/lite/g3doc/microcontrollers/build_convert.md b/tensorflow/lite/g3doc/microcontrollers/build_convert.md index 9c402c568e1..1bac76925ce 100644 --- a/tensorflow/lite/g3doc/microcontrollers/build_convert.md +++ b/tensorflow/lite/g3doc/microcontrollers/build_convert.md @@ -9,6 +9,11 @@ This document explains the process of converting a TensorFlow model to run on microcontrollers. It also outlines the supported operations and gives some guidance on designing and training a model to fit in limited memory. +For an end-to-end, runnable example of building and converting a model, see the +following Jupyter notebook: + +create_sine_model.ipynb + ## Model conversion To convert a trained TensorFlow model to run on microcontrollers, you should use diff --git a/tensorflow/lite/g3doc/microcontrollers/get_started.md b/tensorflow/lite/g3doc/microcontrollers/get_started.md index f5afa01f160..9b126b5c02e 100644 --- a/tensorflow/lite/g3doc/microcontrollers/get_started.md +++ b/tensorflow/lite/g3doc/microcontrollers/get_started.md @@ -3,12 +3,54 @@ This document will help you start working with TensorFlow Lite for Microcontrollers. -## Sample code +Start by reading through and running our [Examples](#examples). 
-To get started, you can explore the following example: +Note: If you need a device to get started, we recommend the +[SparkFun Edge Powered by TensorFlow](https://www.sparkfun.com/products/15170). +It was designed in conjunction with the TensorFlow Lite team to offer a flexible +platform for experimenting with deep learning on microcontrollers. -Micro -speech example +For a walkthrough of the code required to run inference, see the *Run inference* +section below. + +## Examples + +There are several examples that demonstrate how to build embedded machine +learning applications with TensorFlow Lite: + +### Hello World example + +This example is designed to demonstrate the absolute basics of using TensorFlow +Lite for Microcontrollers. It includes the full end-to-end workflow of training +a model, converting it for use with TensorFlow Lite, and running inference on a +microcontroller. + +In the example, a model is trained to replicate a sine function. When deployed +to a microcontroller, its predictions are used to either blink LEDs or control +an animation. + +Hello +World example + +The example code includes a Jupyter notebook that demonstrates how the model is +trained and converted: + +create_sine_model.ipynb + +The process of building and converting a model is also covered in the guide +[Build and convert models](build_convert.md). + +To see how inference is performed, take a look at +[hello_world_test.cc](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/examples/hello_world/hello_world_test.cc). + +The example is tested on the following platforms: + +- [SparkFun Edge Powered by TensorFlow (Apollo3 Blue)](https://www.sparkfun.com/products/15170) +- [Arduino MKRZERO](https://store.arduino.cc/usa/arduino-mkrzero) +- [STM32F746G Discovery Board](https://www.st.com/en/evaluation-tools/32f746gdiscovery.html) +- Mac OS X + +### Micro Speech example This example uses a simple [audio recognition model](https://www.tensorflow.org/tutorials/sequences/audio_recognition) @@ -16,48 +58,43 @@ to identify keywords in speech. The sample code captures audio from a device's microphones. The model classifies this audio in real time, determining whether the word "yes" or "no" has been spoken. -The sample works end-to-end (including audio capture and inference) on the -following platforms: +Micro +Speech example + +The [Run inference](#run_inference) section walks through the code of the Micro +Speech sample and explains how it works. + +The example is tested on the following platforms: - [SparkFun Edge Powered by TensorFlow (Apollo3 Blue)](https://www.sparkfun.com/products/15170) - [STM32F746G Discovery Board](https://www.st.com/en/evaluation-tools/32f746gdiscovery.html) - Mac OS X -### SparkFun Edge - -If you need a device to get started, we recommend the -[SparkFun Edge Powered by TensorFlow](https://www.sparkfun.com/products/15170). -It was designed in conjunction with the TensorFlow Lite team to offer a flexible -platform for experimenting with deep learning on microcontrollers. - -To get started using the Edge board, we recommend following +Note: To get started using the SparkFun Edge board, we recommend following [Machine learning on a microcontroller with SparkFun TensorFlow](https://codelabs.developers.google.com/codelabs/sparkfun-tensorflow), -a codelab that introduces you to the development workflow. +a codelab that introduces you to the development workflow using the Micro Speech +example. 
-## Workflow +### Micro Vision example -Using TensorFlow Lite for Microcontrollers involves four major steps: +This example shows how you can use TensorFlow Lite to run a 250 kilobyte neural +network to recognize people in images captured by a camera. It is designed to +run on systems with small amounts of memory such as microcontrollers and DSPs. -1. Create or find a model architecture. -2. Train a model. -3. Convert the model. -4. Write code to run inference. +Micro +Vision example -The first three steps are covered in the guide -[Build and convert models](build_convert.md). The sample code comes with a -pretrained model, and includes scripts to train a model that recognizes -different spoken words. Instructions on training are in -[README.md](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/examples/micro_speech/README.md#creating-your-own-model). +The example is tested on the following platforms: -In this document, we will focus on the code that will feed processed audio data -into the model and execute it, resulting in a prediction of which word was -spoken. This process is called *inference*. +- [SparkFun Edge Powered by TensorFlow (Apollo3 Blue)](https://www.sparkfun.com/products/15170) +- [STM32F746G Discovery Board](https://www.st.com/en/evaluation-tools/32f746gdiscovery.html) +- Mac OS X ## Run inference -The sample's +The following section walks through the [Micro Speech](#micro_speech) sample's [main.cc](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/micro/examples/micro_speech/main.cc) -contains the code that runs inference. We'll now walk through the key parts. +and explains how it used TensorFlow Lite for Microcontrollers to run inference. ### Includes @@ -277,48 +314,9 @@ recognition results across a number of frames. This is defined in The same technique can be used to improve reliability when processing any continuous stream of data. -## Build the sample - -The sample contains build scripts that will download all required dependencies -and compile a binary that can be run on a device. - -Note: The build process has been tested on MacOS and Linux, but not on Windows. - -To build the sample, take the following steps: - -1. Clone the TensorFlow repository from GitHub to a convenient place. - - ```bash - git clone --depth 1 https://github.com/tensorflow/tensorflow.git - ``` - -1. Enter the directory that was created in the previous step. - - ```bash - cd tensorflow - ``` - -1. If you are using MacOS, run the following command. If you are using Linux, - you do not need to do this. - - ```bash - PATH=tensorflow/lite/experimental/micro/tools/make/downloads/gcc_embedded/bin/:$PATH - ``` - -1. To download all of the required dependencies and initiate the build process, - issue the following command. You can set `TARGET` depending on which - platform you want to build for. Explore - [`targets/`](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro/tools/make/targets) - for the current options. - - ```bash - make -f tensorflow/lite/experimental/micro/tools/make/Makefile - TARGET=sparkfun_edge micro_speech_bin - ``` - ## Next steps -Once you have built and run the sample, read the following documents: +Once you have built and run the samples, read the following documents: * Learn how to work with models in [Build and convert models](build_convert.md). 
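The Hello World walkthrough above trains a small network to approximate a sine function and converts it with the TensorFlow Lite converter (the create_sine_model.ipynb notebook referenced in the docs). A minimal sketch of that train-and-convert flow, assuming a TensorFlow 2.x-style Keras and `TFLiteConverter` API; the layer sizes, epoch count, and output file name here are illustrative, not the notebook's actual values:

```python
import numpy as np
import tensorflow as tf

# Synthetic data: x in [0, 2*pi], y = sin(x) plus a little noise.
x = np.random.uniform(0, 2 * np.pi, 1000).astype(np.float32).reshape(-1, 1)
y = (np.sin(x) + 0.1 * np.random.randn(1000, 1)).astype(np.float32)

# A tiny fully connected regressor, small enough for a microcontroller.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation="relu", input_shape=(1,)),
    tf.keras.layers.Dense(16, activation="relu"),
    tf.keras.layers.Dense(1),
])
model.compile(optimizer="adam", loss="mse")
model.fit(x, y, epochs=5, batch_size=16, verbose=0)

# Convert the trained model to a TensorFlow Lite flatbuffer.
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()
with open("sine_model.tflite", "wb") as f:
    f.write(tflite_model)
```

The resulting `.tflite` flatbuffer is what the microcontroller examples embed in the firmware (typically as a C array) and execute with the micro interpreter.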
From bcd5fa6f4f29407be080bd6576291f47fbbe779d Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Sat, 20 Jul 2019 19:46:18 -0700 Subject: [PATCH 0246/3053] Add missing header. PiperOrigin-RevId: 259163944 --- tensorflow/compiler/mlir/xla/BUILD | 1 + tensorflow/compiler/mlir/xla/hlo_function_importer.h | 1 + 2 files changed, 2 insertions(+) diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index fd1aa690fff..fe4d7e3019d 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -257,6 +257,7 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla:xla_proto", "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/core:lib", "@llvm//:support", "@local_config_mlir//:IR", "@local_config_mlir//:StandardOps", diff --git a/tensorflow/compiler/mlir/xla/hlo_function_importer.h b/tensorflow/compiler/mlir/xla/hlo_function_importer.h index ee321432f4d..c1f091a08cd 100644 --- a/tensorflow/compiler/mlir/xla/hlo_function_importer.h +++ b/tensorflow/compiler/mlir/xla/hlo_function_importer.h @@ -28,6 +28,7 @@ limitations under the License. #include "tensorflow/compiler/xla/status.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/types.h" namespace xla { From 9d4653a829546eef15fbed0d6c8215fe436573c6 Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Sat, 20 Jul 2019 20:01:50 -0700 Subject: [PATCH 0247/3053] Use TF protobuf library directly instead. PiperOrigin-RevId: 259164732 --- tensorflow/compiler/mlir/xla/xla_mlir_translate.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/xla_mlir_translate.cc b/tensorflow/compiler/mlir/xla/xla_mlir_translate.cc index 9804858c084..2c4bddd2d8e 100644 --- a/tensorflow/compiler/mlir/xla/xla_mlir_translate.cc +++ b/tensorflow/compiler/mlir/xla/xla_mlir_translate.cc @@ -15,7 +15,6 @@ limitations under the License. #include "tensorflow/compiler/mlir/xla/xla_mlir_translate.h" -#include "google/protobuf/text_format.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/ToolOutputFile.h" #include "mlir/IR/Module.h" // TF:local_config_mlir @@ -26,6 +25,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/protobuf.h" using stream_executor::port::Status; using stream_executor::port::StatusOr; // NOLINT TODO(b/130822468) fix this @@ -34,13 +34,13 @@ namespace xla { namespace { // Error collector that simply ignores errors reported. -class NoOpErrorCollector : public ::proto2::io::ErrorCollector { +class NoOpErrorCollector : public tensorflow::protobuf::io::ErrorCollector { public: void AddError(int line, int column, const string& message) override {} }; bool LoadHloProto(const std::string& contents, HloProto* hlo_proto) { - ::proto2::TextFormat::Parser parser; + tensorflow::protobuf::TextFormat::Parser parser; NoOpErrorCollector collector; parser.RecordErrorsTo(&collector); return hlo_proto->ParseFromString(contents) || From 97b7aa03b7b2abd2fd6431b6c482dbb61a8d39cd Mon Sep 17 00:00:00 2001 From: Anjali Sridhar Date: Sat, 20 Jul 2019 22:14:15 -0700 Subject: [PATCH 0248/3053] Allow creation of iterators in graph mode when using experimental_make_datasets_from_function API. 
PiperOrigin-RevId: 259172008 --- .../python/distribute/distribute_lib.py | 9 +- .../python/distribute/distribute_lib_test.py | 10 +- tensorflow/python/distribute/input_lib.py | 150 +++++++++++++++--- .../python/distribute/mirrored_strategy.py | 2 +- .../distribute/mirrored_strategy_test.py | 2 +- .../python/distribute/one_device_strategy.py | 2 +- .../distribute/parameter_server_strategy.py | 2 +- .../python/distribute/strategy_test_lib.py | 48 +++--- tensorflow/python/distribute/tpu_strategy.py | 2 +- 9 files changed, 159 insertions(+), 68 deletions(-) diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py index 45dc7480869..a582c0f82b8 100644 --- a/tensorflow/python/distribute/distribute_lib.py +++ b/tensorflow/python/distribute/distribute_lib.py @@ -672,8 +672,6 @@ class Strategy(object): def experimental_distribute_datasets_from_function(self, dataset_fn): """Distributes `tf.data.Dataset` instances created by calls to `dataset_fn`. - Note: This API can only be used in eager mode. - `dataset_fn` will be called once for each worker in the strategy. Each replica on that worker will dequeue one batch of inputs from the local `Dataset` (i.e. if a worker has two replicas, two batches will be dequeued @@ -718,11 +716,8 @@ class Strategy(object): A "distributed `Dataset`", which acts like a `tf.data.Dataset` except it produces "per-replica" values. """ - if ops.executing_eagerly_outside_functions(): - return self._extended._experimental_distribute_datasets_from_function( # pylint: disable=protected-access - dataset_fn) - raise RuntimeError("`experimental_distribute_datasets_from_function` is " # pylint: disable=g-doc-exception - "only supported when eager execution is enabled.") + return self._extended._experimental_distribute_datasets_from_function( # pylint: disable=protected-access + dataset_fn) def experimental_run_v2(self, fn, args=(), kwargs=None): """Run `fn` on each replica, with the given arguments. diff --git a/tensorflow/python/distribute/distribute_lib_test.py b/tensorflow/python/distribute/distribute_lib_test.py index 27db4261f8b..d0d14a7831e 100644 --- a/tensorflow/python/distribute/distribute_lib_test.py +++ b/tensorflow/python/distribute/distribute_lib_test.py @@ -500,12 +500,10 @@ class DefaultDistributionStrategyTest(test.TestCase, parameterized.TestCase): self.assertAllEqual([0, 1], self.evaluate(next_val)) else: dataset_fn = lambda _: dataset_ops.DatasetV2.range(10).batch(2) - with self.assertRaisesRegexp(RuntimeError, - "only supported when eager execution is " - "enabled"): - dist_dataset_from_func = \ - default_strategy.experimental_distribute_datasets_from_function( - dataset_fn) + dist_dataset_from_func = \ + default_strategy.experimental_distribute_datasets_from_function( + dataset_fn) + dataset_ops.make_initializable_iterator(dist_dataset_from_func) class InputContextTest(test.TestCase): diff --git a/tensorflow/python/distribute/input_lib.py b/tensorflow/python/distribute/input_lib.py index 9822d223433..84b2351d4b1 100644 --- a/tensorflow/python/distribute/input_lib.py +++ b/tensorflow/python/distribute/input_lib.py @@ -88,6 +88,44 @@ def get_distributed_dataset(dataset, input_context=input_context) +def get_distributed_datasets_from_function(dataset_fn, + input_workers, + input_contexts, + strategy): + """Returns a wrapped tf.data.DatasetV1 or tf.data.DatasetV2 instance. 
+ + This is a common function that is used by all strategies to return the right + tf.data.Dataset wrapped instance depending on if we are in graph or eager + mode. + + Args: + dataset_fn: a function that returns a tf.data.DatasetV1 or tf.data.DatasetV2 + instance. + input_workers: an InputWorkers object which specifies devices on which + iterators should be created. + input_contexts: A list of `InputContext` instances to be passed to call(s) + to `dataset_fn`. Length and order should match worker order in + `worker_device_pairs`. + strategy: a `tf.distribute.Strategy` object, used to run all-reduce to + handle last partial batch. + + Returns: + A wrapped tf.data.DatasetV1 or tf.data.DatasetV2 instance. + """ + if ops.executing_eagerly_outside_functions(): + return DistributedDatasetsFromFunction( + dataset_fn, + input_workers, + input_contexts, + strategy) + else: + return DistributedDatasetsFromFunctionV1( + dataset_fn, + input_workers, + input_contexts, + strategy) + + class InputWorkers(object): """A 1-to-many mapping from input worker devices to compute devices.""" @@ -478,20 +516,23 @@ class DistributedDataset(_IterableInput): self._cloned_datasets.append(cloned_dataset) self._input_workers = input_workers + # TODO(anjalisridhar): Identify if we need to set this property on the + # iterator. self.element_spec = dataset.element_spec self._strategy = strategy def __iter__(self): - if (context.executing_eagerly() or - ops.executing_eagerly_outside_functions()): - worker_iterators = _create_iterators_per_worker(self._cloned_datasets, - self._input_workers) - iterator = DistributedIterator(self._input_workers, worker_iterators, - self._strategy) - iterator.element_spec = self.element_spec - return iterator - raise RuntimeError("__iter__() is only supported inside of tf.function " - "or when eager execution is enabled.") + if not (context.executing_eagerly() or + ops.get_default_graph().building_function): + raise RuntimeError("__iter__() is only supported inside of tf.function " + "or when eager execution is enabled.") + + worker_iterators = _create_iterators_per_worker(self._cloned_datasets, + self._input_workers) + iterator = DistributedIterator(self._input_workers, worker_iterators, + self._strategy) + iterator.element_spec = self.element_spec # pylint: disable=protected-access + return iterator class DistributedDatasetV1(DistributedDataset): @@ -512,7 +553,18 @@ class DistributedDatasetV1(DistributedDataset): input_context=input_context) def make_one_shot_iterator(self): - """Get a one time use iterator for DistributedDatasetV1.""" + """Get a one time use iterator for DistributedDatasetV1. + + Note: This API is deprecated. Please use `for ... in dataset:` to iterate + over the dataset or `iter` to create an iterator. + + Returns: + A DistributedIteratorV1 instance. + """ + return self._make_one_shot_iterator() + + def _make_one_shot_iterator(self): + """Get an iterator for DistributedDatasetV1.""" # Graph mode with one shot iterator is disabled because we have to call # `initialize` on the iterator which is only required if we are using a # tf.distribute strategy. @@ -522,12 +574,24 @@ class DistributedDatasetV1(DistributedDataset): return self._get_iterator() def make_initializable_iterator(self): + """Get an initializable iterator for DistributedDatasetV1. + + Note: This API is deprecated. Please use + `tf.compat.v1.data.make_initializable_iterator(dataset)` to create an + initializable iterator. + + Returns: + A DistributedIteratorV1 instance. 
+ """ + return self._make_initializable_iterator() + + def _make_initializable_iterator(self, shared_name=None): # pylint: disable=unused-argument """Get an initializable iterator for DistributedDatasetV1.""" # Eager mode generates already initialized iterators. Hence we cannot create # an initializable iterator. if context.executing_eagerly(): raise ValueError("Cannot create initializable iterator in Eager mode. " - "Please use `make_one_shot_iterator` instead.") + "Please use `iter()` instead.") return self._get_iterator() def _get_iterator(self): @@ -535,7 +599,7 @@ class DistributedDatasetV1(DistributedDataset): self._input_workers) iterator = DistributedIteratorV1(self._input_workers, worker_iterators, self._strategy) - iterator.element_spec = self.element_spec + iterator.element_spec = self.element_spec # pylint: disable=protected-access return iterator @@ -570,18 +634,45 @@ class DistributedDatasetsFromFunction(_IterableInput): self._strategy = strategy def __iter__(self): - iterators = [] - for i, ctx in enumerate(self._input_contexts): - worker = self._input_workers.worker_devices[i] - with ops.device(worker): - dataset = self._dataset_fn(ctx) - devices = self._input_workers.compute_devices_for_worker(i) - iterator = _SingleWorkerDatasetIterator(dataset, worker, devices) - iterators.append(iterator) + if not (context.executing_eagerly() or + ops.get_default_graph().building_function): + raise RuntimeError("__iter__() is only supported inside of tf.function " + "or when eager execution is enabled.") + iterators = _create_iterators_per_worker_with_input_context( + self._input_contexts, self._input_workers, self._dataset_fn) return DistributedIterator(self._input_workers, iterators, self._strategy) +class DistributedDatasetsFromFunctionV1(DistributedDatasetsFromFunction): + """Inputs created from dataset function.""" + + def _make_initializable_iterator(self, shared_name=None): + """Get an initializable iterator for DistributedDatasetsFromFunctionV1.""" + del shared_name # Unused + # Eager mode generates already initialized iterators. Hence we cannot create + # an initializable iterator. + if context.executing_eagerly(): + raise ValueError("Cannot create initializable iterator in Eager mode. " + "Please use `iter()` instead.") + return self._get_iterator() + + def _make_one_shot_iterator(self): + """Get an iterator for iterating over DistributedDatasetsFromFunctionV1.""" + # Graph mode with one shot iterator is disabled because we have to call + # `initialize` on the iterator which is only required if we are using a + # tf.distribute strategy. + if not context.executing_eagerly(): + raise ValueError("Cannot create a one shot iterator. Please use " + "`make_initializable_iterator()` instead.") + return self._get_iterator() + + def _get_iterator(self): + iterators = _create_iterators_per_worker_with_input_context( + self._input_contexts, self._input_workers, self._dataset_fn) + return DistributedIteratorV1(self._input_workers, iterators, self._strategy) + + # TODO(anjalisridhar): This class will be soon be removed in favor of newer # APIs. 
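The `input_lib.py` additions above introduce graph-mode (V1) wrappers, `DistributedDatasetV1` and `DistributedDatasetsFromFunctionV1`, so a distributed dataset built from a dataset function can be consumed through an initializable iterator rather than only via eager `iter()`, as exercised in the updated `distribute_lib_test.py`. A rough usage sketch assuming a TF 1.x-style graph-mode program built with this change; the choice of `MirroredStrategy`, the batch size, and the session wiring are illustrative assumptions, not part of the patch:

```python
import tensorflow as tf  # assumes a TF 1.x (graph mode by default) build

strategy = tf.distribute.MirroredStrategy()

def dataset_fn(input_context):
  # The strategy passes an InputContext so each worker/replica can size and
  # shard its own dataset.
  batch_size = input_context.get_per_replica_batch_size(8)
  return tf.data.Dataset.range(32).batch(batch_size)

dist_dataset = strategy.experimental_distribute_datasets_from_function(dataset_fn)

# In graph mode this now resolves to DistributedDatasetsFromFunctionV1, which
# allows an initializable iterator instead of raising a RuntimeError.
iterator = tf.compat.v1.data.make_initializable_iterator(dist_dataset)
next_batch = iterator.get_next()

with tf.compat.v1.Session() as sess:
  sess.run(iterator.initialize())
  print(sess.run(strategy.experimental_local_results(next_batch)))
```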
class InputFunctionIterator(DistributedIteratorV1): @@ -668,7 +759,7 @@ class DatasetIterator(DistributedIteratorV1): dist_dataset._cloned_datasets, input_workers) # pylint: disable=protected-access super(DatasetIterator, self).__init__( input_workers, - worker_iterators, + worker_iterators, # pylint: disable=protected-access strategy) self.element_spec = dist_dataset.element_spec # pylint: disable=protected-access @@ -863,6 +954,21 @@ def _create_iterators_per_worker(worker_datasets, input_workers): return iterators +def _create_iterators_per_worker_with_input_context(input_contexts, + input_workers, + dataset_fn): + """Create a multidevice iterator per workers given a dataset function.""" + iterators = [] + for i, ctx in enumerate(input_contexts): + worker = input_workers.worker_devices[i] + with ops.device(worker): + dataset = dataset_fn(ctx) + devices = input_workers.compute_devices_for_worker(i) + iterator = _SingleWorkerDatasetIterator(dataset, worker, devices) + iterators.append(iterator) + return iterators + + # TODO(sourabhbajaj): Remove this in lieu of distributed datasets def _get_batched_dataset(d): """Get the batched dataset from `d`.""" diff --git a/tensorflow/python/distribute/mirrored_strategy.py b/tensorflow/python/distribute/mirrored_strategy.py index 811bd2541e8..0afbb831ccc 100644 --- a/tensorflow/python/distribute/mirrored_strategy.py +++ b/tensorflow/python/distribute/mirrored_strategy.py @@ -556,7 +556,7 @@ class MirroredExtended(distribute_lib.StrategyExtendedV1): input_pipeline_id=i, num_replicas_in_sync=self._num_replicas_in_sync)) - return input_lib.DistributedDatasetsFromFunction( + return input_lib.get_distributed_datasets_from_function( dataset_fn, self._input_workers, input_contexts, diff --git a/tensorflow/python/distribute/mirrored_strategy_test.py b/tensorflow/python/distribute/mirrored_strategy_test.py index 7e606dbd500..4e8f14ef4b6 100644 --- a/tensorflow/python/distribute/mirrored_strategy_test.py +++ b/tensorflow/python/distribute/mirrored_strategy_test.py @@ -1183,7 +1183,7 @@ class MultiWorkerMirroredStrategyTestWithChief( strategy = mirrored_strategy.MirroredStrategy() self.assertIsInstance(strategy.extended._inferred_cross_device_ops, cross_device_ops_lib.NcclAllReduce) - self.skipTest('b/130551176, run the following once fixed.') + self.skipTest("b/130551176, run the following once fixed.") self._test_minimize_loss_graph(strategy, learning_rate=0.05) def testInitializeFromTFConfig(self): diff --git a/tensorflow/python/distribute/one_device_strategy.py b/tensorflow/python/distribute/one_device_strategy.py index 8381a4d34cd..6a79b86a5fd 100644 --- a/tensorflow/python/distribute/one_device_strategy.py +++ b/tensorflow/python/distribute/one_device_strategy.py @@ -300,7 +300,7 @@ class OneDeviceExtended(distribute_lib.StrategyExtendedV1): self._container_strategy()) def _experimental_distribute_datasets_from_function(self, dataset_fn): - return input_lib.DistributedDatasetsFromFunction( + return input_lib.get_distributed_datasets_from_function( dataset_fn, self._input_workers, [distribute_lib.InputContext()], diff --git a/tensorflow/python/distribute/parameter_server_strategy.py b/tensorflow/python/distribute/parameter_server_strategy.py index 829b54af4b6..e1a8bb370c4 100644 --- a/tensorflow/python/distribute/parameter_server_strategy.py +++ b/tensorflow/python/distribute/parameter_server_strategy.py @@ -336,7 +336,7 @@ class ParameterServerStrategyExtended(distribute_lib.StrategyExtendedV1): input_pipeline_id=input_pipeline_id, 
num_replicas_in_sync=self._num_replicas_in_sync) - return input_lib.DistributedDatasetsFromFunction( + return input_lib.get_distributed_datasets_from_function( dataset_fn, self._input_workers, [input_context], diff --git a/tensorflow/python/distribute/strategy_test_lib.py b/tensorflow/python/distribute/strategy_test_lib.py index 06c791c6bdf..7f6d0b9f064 100644 --- a/tensorflow/python/distribute/strategy_test_lib.py +++ b/tensorflow/python/distribute/strategy_test_lib.py @@ -52,6 +52,7 @@ from tensorflow.python.training import optimizer from tensorflow.python.training import training_util from tensorflow.python.util import nest + class _TestException(Exception): pass @@ -308,41 +309,32 @@ class DistributionTestBase(test.TestCase): def _test_input_fn_iterable( self, strategy, input_fn, expected_values, ignore_order=False): - if context.executing_eagerly(): - self._test_input_fn_iterable_in_eager_mode( - strategy, input_fn, expected_values, ignore_order=False) - else: - self._test_input_fn_iterable_in_graph_mode( - strategy, input_fn, expected_values, ignore_order=False) - - def _test_input_fn_iterable_in_graph_mode( - self, strategy, input_fn, expected_values, ignore_order=False): - with self.assertRaisesRegexp(RuntimeError, "only supported when eager " - "execution is enabled"): - strategy.experimental_distribute_datasets_from_function(input_fn) - - def _test_input_fn_iterable_in_eager_mode( - self, strategy, input_fn, expected_values, ignore_order=False): assert_same = self.assertCountEqual if ignore_order else self.assertEqual iterable = strategy.experimental_distribute_datasets_from_function(input_fn) - iterator = iter(iterable) + if context.executing_eagerly(): + iterator = iter(iterable) - for expected_value in expected_values: - computed_value = self.evaluate( - list(strategy.experimental_local_results(next(iterator)))) - assert_same(expected_value, computed_value) + for expected_value in expected_values: + computed_value = self.evaluate( + list(strategy.experimental_local_results(next(iterator)))) + assert_same(expected_value, computed_value) - with self.assertRaises(StopIteration): - self.evaluate(strategy.experimental_local_results(next(iterator))) + with self.assertRaises(StopIteration): + self.evaluate(strategy.experimental_local_results(next(iterator))) - # After re-initializing the iterator, should be able to iterate again. - iterator = iter(iterable) + # After re-initializing the iterator, should be able to iterate again. 
+ iterator = iter(iterable) - for expected_value in expected_values: - computed_value = self.evaluate( - list(strategy.experimental_local_results(next(iterator)))) - assert_same(expected_value, computed_value) + for expected_value in expected_values: + computed_value = self.evaluate( + list(strategy.experimental_local_results(next(iterator)))) + assert_same(expected_value, computed_value) + else: + iterator = dataset_ops.make_initializable_iterator(iterable) + self._test_input_fn_iterator(iterator, strategy.extended.worker_devices, + expected_values, test_reinitialize=True, + ignore_order=ignore_order) def _test_input_fn_iterator(self, iterator, diff --git a/tensorflow/python/distribute/tpu_strategy.py b/tensorflow/python/distribute/tpu_strategy.py index 7aa99b9a8c4..2d301b51e41 100644 --- a/tensorflow/python/distribute/tpu_strategy.py +++ b/tensorflow/python/distribute/tpu_strategy.py @@ -264,7 +264,7 @@ class TPUExtended(distribute_lib.StrategyExtendedV1): input_pipeline_id=i, num_replicas_in_sync=self._num_replicas_in_sync)) - return input_lib.DistributedDatasetsFromFunction( + return input_lib.get_distributed_datasets_from_function( dataset_fn, self._input_workers, input_contexts, From 3a21119fce232727aae104f7912c05dc94ffdd5e Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 21 Jul 2019 05:22:22 +0000 Subject: [PATCH 0249/3053] Fix python 3 test failure due to string vs byte (b'') Signed-off-by: Yong Tang --- tensorflow/python/ops/sparse_ops_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/ops/sparse_ops_test.py b/tensorflow/python/ops/sparse_ops_test.py index c78aae3cfd0..83d6645f6f1 100644 --- a/tensorflow/python/ops/sparse_ops_test.py +++ b/tensorflow/python/ops/sparse_ops_test.py @@ -131,7 +131,7 @@ class SparseOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase): values=['a', 'b'], dense_shape=[2, 3]) dense = sparse_ops.sparse_tensor_to_dense(sp) - expected_dense = [['a', '', ''], ['', '', 'b']] + expected_dense = [[b'a', b'', b''], [b'', b'', b'b']] result_dense = self.evaluate(dense) self.assertAllEqual(expected_dense, result_dense) From bc1523b0d3106aefda715ae023d7e84ce139d03b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 21 Jul 2019 02:02:19 -0700 Subject: [PATCH 0250/3053] compat: Update forward compatibility horizon to 2019-07-21 PiperOrigin-RevId: 259186418 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 330066fc91b..128253b357e 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -31,7 +31,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2019, 7, 20) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2019, 7, 21) _FORWARD_COMPATIBILITY_HORIZON_OVERRIDDEN = False _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" From e03b209ab00f194f3b5588298d40fec7acf7e4d3 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 21 Jul 2019 02:02:20 -0700 Subject: [PATCH 0251/3053] Update GraphDef version to 103. 
PiperOrigin-RevId: 259186422 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index ad5c3c56a84..dcf8c974a63 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 102 // Updated: 2019/7/20 +#define TF_GRAPH_DEF_VERSION 103 // Updated: 2019/7/21 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From fe440e052816570f7a42c6554360460b1e5afbbf Mon Sep 17 00:00:00 2001 From: Anjali Sridhar Date: Sun, 21 Jul 2019 09:55:58 -0700 Subject: [PATCH 0252/3053] Update tf.distribute overview doc page PiperOrigin-RevId: 259213135 --- tensorflow/python/distribute/distribute_lib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py index a582c0f82b8..ec85cd3f183 100644 --- a/tensorflow/python/distribute/distribute_lib.py +++ b/tensorflow/python/distribute/distribute_lib.py @@ -63,7 +63,7 @@ the same way with eager and graph execution. each replica are aggregated together before updating the model variables. This is in contrast to _asynchronous_, or _async_ training, where each replica updates the model variables independently. You may also have replicas - partitioned into gropus which are in sync within each group but async between + partitioned into groups which are in sync within each group but async between groups. * _Parameter servers_: These are machines that hold a single copy of parameters/variables, used by some strategies (right now just From 0d032cffa000d1a4da8760336b8c627f03d0cc08 Mon Sep 17 00:00:00 2001 From: Anjali Sridhar Date: Sun, 21 Jul 2019 10:18:20 -0700 Subject: [PATCH 0253/3053] Update `tf.distribute.ParameterServerStrategy` API docs. PiperOrigin-RevId: 259214298 --- .../distribute/parameter_server_strategy.py | 42 ++++++++++++------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/tensorflow/python/distribute/parameter_server_strategy.py b/tensorflow/python/distribute/parameter_server_strategy.py index e1a8bb370c4..42a03038e05 100644 --- a/tensorflow/python/distribute/parameter_server_strategy.py +++ b/tensorflow/python/distribute/parameter_server_strategy.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Classes implementing a multi-worker ps DistributionStrategy.""" +"""Class implementing a multi-worker parameter server tf.distribute strategy.""" from __future__ import absolute_import from __future__ import division @@ -48,13 +48,13 @@ _LOCAL_CPU = "/device:CPU:0" # TODO(yuefengz): maybe cache variables on local CPU. @tf_export("distribute.experimental.ParameterServerStrategy", v1=[]) class ParameterServerStrategy(distribute_lib.Strategy): - """An asynchronous multi-worker parameter server DistributionStrategy. + """An asynchronous multi-worker parameter server tf.distribute strategy. - This strategy requires two jobs: workers and parameter servers. Variables and + This strategy requires two jobs: workers and parameter servers. Variables and updates to those variables will be assigned to parameter servers and other operations are assigned to workers. 
- When each worker has more than one GPU, operations will be replicated on these + When each worker has more than one GPU, operations will be replicated on all GPUs. Even though operations may be replicated, variables are not and each worker shares a common view for which parameter server a variable is assigned to. @@ -83,11 +83,24 @@ class ParameterServerStrategy(distribute_lib.Strategy): 2) It is also not recommended to open a colocation scope (i.e. calling `tf.compat.v1.colocate_with`) under the strategy's scope. For colocating variables, use `strategy.extended.colocate_vars_with` instead. Colocation of - ops will possibly create conflicts of device assignment. + ops will possibly create device assignment conflicts. + + Note: This strategy only works with the Estimator API. Pass an instance of + this strategy to the `experimental_distribute` argument when you create the + `RunConfig`. This instance of `RunConfig` should then be passed to the + `Estimator` instance on which `train_and_evaluate` is called. + + For Example: + ``` + strategy = tf.distribute.experimental.ParameterServerStrategy() + run_config = tf.estimator.RunConfig( + experimental_distribute.train_distribute=strategy) + estimator = tf.estimator.Estimator(config=run_config) + tf.estimator.train_and_evaluate(estimator,...) """ def __init__(self, cluster_resolver=None): - """Initializes this strategy. + """Initializes this strategy with an optional `cluster_resolver`. Args: cluster_resolver: Optional @@ -103,7 +116,7 @@ class ParameterServerStrategy(distribute_lib.Strategy): super(ParameterServerStrategy, self).__init__(extended) -@tf_export(v1=["distribute.experimental.ParameterServerStrategy"]) +@tf_export(v1=["distribute.experimental.ParameterServerStrategy"]) # pylint: disable=missing-docstring class ParameterServerStrategyV1(distribute_lib.StrategyV1): __doc__ = ParameterServerStrategy.__doc__ @@ -113,6 +126,7 @@ class ParameterServerStrategyV1(distribute_lib.StrategyV1): super(ParameterServerStrategyV1, self).__init__( ParameterServerStrategyExtended( self, cluster_resolver=cluster_resolver)) + __init__.__doc__ = ParameterServerStrategy.__init__.__doc__ # TODO(josh11b): Switch to V2 when we no longer need to support tf.compat.v1. @@ -241,7 +255,7 @@ class ParameterServerStrategyExtended(distribute_lib.StrategyExtendedV1): compute_devices, parameter_device, cluster_resolver=None): - """Initialize internal devices for local training.""" + """Initialize local devices for training.""" worker_device = device_util.canonicalize("/device:CPU:0") self._input_host_device = numpy_dataset.SingleDevice(worker_device) @@ -359,7 +373,7 @@ class ParameterServerStrategyExtended(distribute_lib.StrategyExtendedV1): def _allow_variable_partition(self): return not context.executing_eagerly() - # TODO(yuefengz): not all ops in device_setter.STANDARD_PS_OPS will go through + # TODO(yuefengz): Not all ops in device_setter.STANDARD_PS_OPS will go through # this creator, such as "MutableHashTable". 
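The updated docstring above describes wiring the strategy into `tf.estimator.RunConfig` and `train_and_evaluate`; the docstring routes the strategy through the `experimental_distribute` argument, while the hedged, self-contained sketch below uses `RunConfig`'s `train_distribute` argument, which also accepts a strategy instance. Without a `TF_CONFIG` or cluster resolver the strategy simply falls back to local mode, but the plumbing is the same on a real worker/parameter-server cluster:

```python
import tensorflow as tf

strategy = tf.distribute.experimental.ParameterServerStrategy()
config = tf.estimator.RunConfig(train_distribute=strategy)
print(config.train_distribute)  # the ParameterServerStrategy instance

# The config is then passed to tf.estimator.Estimator(..., config=config),
# and tf.estimator.train_and_evaluate(...) picks up the strategy from it.
```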
def _create_variable(self, next_creator, *args, **kwargs): if self._num_replicas_in_sync > 1: @@ -455,7 +469,7 @@ class ParameterServerStrategyExtended(distribute_lib.StrategyExtendedV1): value_destination_pairs) def _select_single_value(self, structured): - """Select any single values in `structured`.""" + """Select any single value in `structured`.""" def _select_fn(x): # pylint: disable=g-missing-docstring if isinstance(x, values.Mirrored): @@ -523,13 +537,13 @@ class ParameterServerStrategyExtended(distribute_lib.StrategyExtendedV1): cluster_spec=None, task_type=None, task_id=None): - """Configures the strategy class. + """Configures the strategy class with `cluser_spec`. - The strategy object will be re-initialized if `cluster_spec` is given but - was not passed in the constructor. + The strategy object will be re-initialized if `cluster_spec` is passed to + `configure` but was not passed when instantiating the strategy. Args: - session_config: not used currently. + session_config: Session config object. cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the cluster configurations. task_type: the current task type. From 9ea80327001157b549f6d5925ece40ad423f028d Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Sun, 21 Jul 2019 10:42:13 -0700 Subject: [PATCH 0254/3053] [XLA] Fix comment in literal_util.h. PiperOrigin-RevId: 259215452 --- tensorflow/compiler/xla/literal_util.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h index c50c0baf007..2f12db73330 100644 --- a/tensorflow/compiler/xla/literal_util.h +++ b/tensorflow/compiler/xla/literal_util.h @@ -226,8 +226,7 @@ class LiteralUtil { // in invocation between the above signature and this one. static Literal MakeTupleOwned(std::vector elements); - // This overload lets you pass a braced list of Literals to - // MakeTupleOwned: + // This overload lets you pass a list of Literals to MakeTupleOwned: // // LiteralUtil::MakeTupleOwned(LiteralUtil::CreateR1(...), ...). // From be9e080af6f1cbaa56b4ac96bb7f9fd7f273e242 Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Sun, 21 Jul 2019 15:00:18 -0700 Subject: [PATCH 0255/3053] [XLA:GPU] Remove warning message for 0-byte allocations. These are legal to do (if somewhat pointless) and the warning is noisy since XLA does make 0-byte allocations from time to time. It might also be possible to stop XLA from making 0-byte allocations, but it's not clear why that is a better solution than simply making the allocator not warn about this case; it requires fewer special cases this way. PiperOrigin-RevId: 259228901 --- tensorflow/core/common_runtime/allocator_retry.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/core/common_runtime/allocator_retry.cc b/tensorflow/core/common_runtime/allocator_retry.cc index f3b51c5ca51..3402b7fd919 100644 --- a/tensorflow/core/common_runtime/allocator_retry.cc +++ b/tensorflow/core/common_runtime/allocator_retry.cc @@ -29,7 +29,6 @@ void* AllocatorRetry::AllocateRaw( alloc_func, int max_millis_to_wait, size_t alignment, size_t num_bytes) { if (num_bytes == 0) { - LOG(WARNING) << "Request to allocate 0 bytes"; return nullptr; } uint64 deadline_micros = 0; From 1783e10da1c553069d8a6398703dc230c5a68fea Mon Sep 17 00:00:00 2001 From: Till Hoffmann Date: Sun, 21 Jul 2019 23:15:45 +0100 Subject: [PATCH 0256/3053] tensorflow-gpu without nvidia-runtime. 
--- .../dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile | 8 +++++++- .../tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile | 8 +++++++- .../tools/dockerfiles/dockerfiles/gpu-jupyter.Dockerfile | 8 +++++++- tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile | 8 +++++++- .../ppc64le/devel-gpu-ppc64le-jupyter.Dockerfile | 8 +++++++- .../dockerfiles/ppc64le/devel-gpu-ppc64le.Dockerfile | 8 +++++++- .../dockerfiles/ppc64le/gpu-ppc64le-jupyter.Dockerfile | 8 +++++++- .../dockerfiles/ppc64le/gpu-ppc64le.Dockerfile | 8 +++++++- .../partials/ubuntu/devel-nvidia.partial.Dockerfile | 8 +++++++- .../dockerfiles/partials/ubuntu/nvidia.partial.Dockerfile | 8 +++++++- 10 files changed, 70 insertions(+), 10 deletions(-) diff --git a/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile index 02d8f89919e..a538dd36cdb 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile @@ -32,7 +32,7 @@ ARG CUDNN=7.4.1.5-1 ARG CUDNN_MAJOR_VERSION=7 ARG LIB_DIR_PREFIX=x86_64 -# Needed for string substitution +# Needed for string substitution SHELL ["/bin/bash", "-c"] RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential \ @@ -84,6 +84,12 @@ ARG CACHE_STOP=1 ARG CHECKOUT_TF_SRC=0 RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true +# Link the libcuda stub to the location where tensorflow is searching for it and reconfigure +# dynamic linker run-time bindings +RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 \ + && echo "/usr/local/cuda/lib64/stubs" > /etc/ld.so.conf.d/z-cuda-stubs.conf \ + && ldconfig + ARG USE_PYTHON_3_NOT_2 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3} ARG PYTHON=python${_PY_SUFFIX} diff --git a/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile index 6d00ef3c115..697be2c65bb 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile @@ -32,7 +32,7 @@ ARG CUDNN=7.4.1.5-1 ARG CUDNN_MAJOR_VERSION=7 ARG LIB_DIR_PREFIX=x86_64 -# Needed for string substitution +# Needed for string substitution SHELL ["/bin/bash", "-c"] RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential \ @@ -84,6 +84,12 @@ ARG CACHE_STOP=1 ARG CHECKOUT_TF_SRC=0 RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true +# Link the libcuda stub to the location where tensorflow is searching for it and reconfigure +# dynamic linker run-time bindings +RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 \ + && echo "/usr/local/cuda/lib64/stubs" > /etc/ld.so.conf.d/z-cuda-stubs.conf \ + && ldconfig + ARG USE_PYTHON_3_NOT_2 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3} ARG PYTHON=python${_PY_SUFFIX} diff --git a/tensorflow/tools/dockerfiles/dockerfiles/gpu-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/gpu-jupyter.Dockerfile index fde7c9e8c39..1a18e64f3fd 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/gpu-jupyter.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/gpu-jupyter.Dockerfile @@ -30,7 +30,7 @@ ARG ARCH ARG CUDA ARG CUDNN=7.4.1.5-1 -# Needed for string substitution +# Needed for string substitution SHELL 
["/bin/bash", "-c"] # Pick up some TF dependencies RUN apt-get update && apt-get install -y --no-install-recommends \ @@ -60,6 +60,12 @@ RUN [ ${ARCH} = ppc64le ] || (apt-get update && \ # For CUDA profiling, TensorFlow requires CUPTI. ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH +# Link the libcuda stub to the location where tensorflow is searching for it and reconfigure +# dynamic linker run-time bindings +RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 \ + && echo "/usr/local/cuda/lib64/stubs" > /etc/ld.so.conf.d/z-cuda-stubs.conf \ + && ldconfig + ARG USE_PYTHON_3_NOT_2 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3} ARG PYTHON=python${_PY_SUFFIX} diff --git a/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile index a6ff1a5ccea..07c775c362c 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile @@ -30,7 +30,7 @@ ARG ARCH ARG CUDA ARG CUDNN=7.4.1.5-1 -# Needed for string substitution +# Needed for string substitution SHELL ["/bin/bash", "-c"] # Pick up some TF dependencies RUN apt-get update && apt-get install -y --no-install-recommends \ @@ -60,6 +60,12 @@ RUN [ ${ARCH} = ppc64le ] || (apt-get update && \ # For CUDA profiling, TensorFlow requires CUPTI. ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH +# Link the libcuda stub to the location where tensorflow is searching for it and reconfigure +# dynamic linker run-time bindings +RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 \ + && echo "/usr/local/cuda/lib64/stubs" > /etc/ld.so.conf.d/z-cuda-stubs.conf \ + && ldconfig + ARG USE_PYTHON_3_NOT_2 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3} ARG PYTHON=python${_PY_SUFFIX} diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le-jupyter.Dockerfile index a05c718f6fb..59768aaaabc 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le-jupyter.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le-jupyter.Dockerfile @@ -32,7 +32,7 @@ ARG CUDNN=7.4.1.5-1 ARG CUDNN_MAJOR_VERSION=7 ARG LIB_DIR_PREFIX=x86_64 -# Needed for string substitution +# Needed for string substitution SHELL ["/bin/bash", "-c"] RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential \ @@ -84,6 +84,12 @@ ARG CACHE_STOP=1 ARG CHECKOUT_TF_SRC=0 RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true +# Link the libcuda stub to the location where tensorflow is searching for it and reconfigure +# dynamic linker run-time bindings +RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 \ + && echo "/usr/local/cuda/lib64/stubs" > /etc/ld.so.conf.d/z-cuda-stubs.conf \ + && ldconfig + ARG USE_PYTHON_3_NOT_2 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3} ARG PYTHON=python${_PY_SUFFIX} diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le.Dockerfile index 44d91ad067f..d4a4c928476 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/devel-gpu-ppc64le.Dockerfile @@ 
-32,7 +32,7 @@ ARG CUDNN=7.4.1.5-1 ARG CUDNN_MAJOR_VERSION=7 ARG LIB_DIR_PREFIX=x86_64 -# Needed for string substitution +# Needed for string substitution SHELL ["/bin/bash", "-c"] RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential \ @@ -84,6 +84,12 @@ ARG CACHE_STOP=1 ARG CHECKOUT_TF_SRC=0 RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true +# Link the libcuda stub to the location where tensorflow is searching for it and reconfigure +# dynamic linker run-time bindings +RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 \ + && echo "/usr/local/cuda/lib64/stubs" > /etc/ld.so.conf.d/z-cuda-stubs.conf \ + && ldconfig + ARG USE_PYTHON_3_NOT_2 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3} ARG PYTHON=python${_PY_SUFFIX} diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/gpu-ppc64le-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/gpu-ppc64le-jupyter.Dockerfile index b2f1ce152c2..b265a6039a8 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/gpu-ppc64le-jupyter.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/gpu-ppc64le-jupyter.Dockerfile @@ -30,7 +30,7 @@ ARG ARCH ARG CUDA ARG CUDNN=7.4.1.5-1 -# Needed for string substitution +# Needed for string substitution SHELL ["/bin/bash", "-c"] # Pick up some TF dependencies RUN apt-get update && apt-get install -y --no-install-recommends \ @@ -60,6 +60,12 @@ RUN [ ${ARCH} = ppc64le ] || (apt-get update && \ # For CUDA profiling, TensorFlow requires CUPTI. ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH +# Link the libcuda stub to the location where tensorflow is searching for it and reconfigure +# dynamic linker run-time bindings +RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 \ + && echo "/usr/local/cuda/lib64/stubs" > /etc/ld.so.conf.d/z-cuda-stubs.conf \ + && ldconfig + ARG USE_PYTHON_3_NOT_2 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3} ARG PYTHON=python${_PY_SUFFIX} diff --git a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/gpu-ppc64le.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/gpu-ppc64le.Dockerfile index 3422eadb60c..971d7658cb9 100644 --- a/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/gpu-ppc64le.Dockerfile +++ b/tensorflow/tools/dockerfiles/dockerfiles/ppc64le/gpu-ppc64le.Dockerfile @@ -30,7 +30,7 @@ ARG ARCH ARG CUDA ARG CUDNN=7.4.1.5-1 -# Needed for string substitution +# Needed for string substitution SHELL ["/bin/bash", "-c"] # Pick up some TF dependencies RUN apt-get update && apt-get install -y --no-install-recommends \ @@ -60,6 +60,12 @@ RUN [ ${ARCH} = ppc64le ] || (apt-get update && \ # For CUDA profiling, TensorFlow requires CUPTI. 
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH +# Link the libcuda stub to the location where tensorflow is searching for it and reconfigure +# dynamic linker run-time bindings +RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 \ + && echo "/usr/local/cuda/lib64/stubs" > /etc/ld.so.conf.d/z-cuda-stubs.conf \ + && ldconfig + ARG USE_PYTHON_3_NOT_2 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3} ARG PYTHON=python${_PY_SUFFIX} diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/devel-nvidia.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/devel-nvidia.partial.Dockerfile index fc0976b023f..2ba3a68c68b 100644 --- a/tensorflow/tools/dockerfiles/partials/ubuntu/devel-nvidia.partial.Dockerfile +++ b/tensorflow/tools/dockerfiles/partials/ubuntu/devel-nvidia.partial.Dockerfile @@ -9,7 +9,7 @@ ARG CUDNN=7.4.1.5-1 ARG CUDNN_MAJOR_VERSION=7 ARG LIB_DIR_PREFIX=x86_64 -# Needed for string substitution +# Needed for string substitution SHELL ["/bin/bash", "-c"] RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential \ @@ -60,3 +60,9 @@ ARG CACHE_STOP=1 # Check out TensorFlow source code if --build-arg CHECKOUT_TF_SRC=1 ARG CHECKOUT_TF_SRC=0 RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src || true + +# Link the libcuda stub to the location where tensorflow is searching for it and reconfigure +# dynamic linker run-time bindings +RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 \ + && echo "/usr/local/cuda/lib64/stubs" > /etc/ld.so.conf.d/z-cuda-stubs.conf \ + && ldconfig diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/nvidia.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/nvidia.partial.Dockerfile index b09c6456e9c..bb9253ae2e8 100644 --- a/tensorflow/tools/dockerfiles/partials/ubuntu/nvidia.partial.Dockerfile +++ b/tensorflow/tools/dockerfiles/partials/ubuntu/nvidia.partial.Dockerfile @@ -7,7 +7,7 @@ ARG ARCH ARG CUDA ARG CUDNN=7.4.1.5-1 -# Needed for string substitution +# Needed for string substitution SHELL ["/bin/bash", "-c"] # Pick up some TF dependencies RUN apt-get update && apt-get install -y --no-install-recommends \ @@ -36,3 +36,9 @@ RUN [ ${ARCH} = ppc64le ] || (apt-get update && \ # For CUDA profiling, TensorFlow requires CUPTI. 
ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda/lib64:$LD_LIBRARY_PATH + +# Link the libcuda stub to the location where tensorflow is searching for it and reconfigure +# dynamic linker run-time bindings +RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 \ + && echo "/usr/local/cuda/lib64/stubs" > /etc/ld.so.conf.d/z-cuda-stubs.conf \ + && ldconfig From ca3addea1a508bdc6bc1ab2fc2f574fd69734877 Mon Sep 17 00:00:00 2001 From: Sachin Joglekar Date: Sun, 21 Jul 2019 16:23:08 -0700 Subject: [PATCH 0257/3053] Fixes asan errors introduced due to cl/259085857 PiperOrigin-RevId: 259233558 --- .../object_detection_average_precision_stage.cc | 12 ++++++------ .../object_detection_average_precision_stage.h | 9 ++++----- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/tensorflow/lite/tools/evaluation/stages/object_detection_average_precision_stage.cc b/tensorflow/lite/tools/evaluation/stages/object_detection_average_precision_stage.cc index cfb9a300281..a8c301df65a 100644 --- a/tensorflow/lite/tools/evaluation/stages/object_detection_average_precision_stage.cc +++ b/tensorflow/lite/tools/evaluation/stages/object_detection_average_precision_stage.cc @@ -57,26 +57,26 @@ TfLiteStatus ObjectDetectionAveragePrecisionStage::Init() { } TfLiteStatus ObjectDetectionAveragePrecisionStage::Run() { - for (int i = 0; i < ground_truth_objects_->objects_size(); ++i) { - const int class_id = ground_truth_objects_->objects(i).class_id(); + for (int i = 0; i < ground_truth_objects_.objects_size(); ++i) { + const int class_id = ground_truth_objects_.objects(i).class_id(); if (class_id >= num_classes_) { LOG(ERROR) << "Encountered invalid class ID: " << class_id; return kTfLiteError; } ground_truth_object_vectors_[class_id].push_back(ConvertProtoToDetection( - ground_truth_objects_->objects(i), current_image_index_)); + ground_truth_objects_.objects(i), current_image_index_)); } - for (int i = 0; i < predicted_objects_->objects_size(); ++i) { - const int class_id = predicted_objects_->objects(i).class_id(); + for (int i = 0; i < predicted_objects_.objects_size(); ++i) { + const int class_id = predicted_objects_.objects(i).class_id(); if (class_id >= num_classes_) { LOG(ERROR) << "Encountered invalid class ID: " << class_id; return kTfLiteError; } predicted_object_vectors_[class_id].push_back(ConvertProtoToDetection( - predicted_objects_->objects(i), current_image_index_)); + predicted_objects_.objects(i), current_image_index_)); } current_image_index_++; diff --git a/tensorflow/lite/tools/evaluation/stages/object_detection_average_precision_stage.h b/tensorflow/lite/tools/evaluation/stages/object_detection_average_precision_stage.h index cf230ce697b..16b04827ae5 100644 --- a/tensorflow/lite/tools/evaluation/stages/object_detection_average_precision_stage.h +++ b/tensorflow/lite/tools/evaluation/stages/object_detection_average_precision_stage.h @@ -42,17 +42,16 @@ class ObjectDetectionAveragePrecisionStage : public EvaluationStage { EvaluationStageMetrics LatestMetrics() override; // Call before Run(). - // Both protos must outlive the call to Run(). 
void SetEvalInputs(const ObjectDetectionResult& predicted_objects, const ObjectDetectionResult& ground_truth_objects) { - predicted_objects_ = &predicted_objects; - ground_truth_objects_ = &ground_truth_objects; + predicted_objects_ = predicted_objects; + ground_truth_objects_ = ground_truth_objects; } private: int num_classes_ = -1; - const ObjectDetectionResult* predicted_objects_; - const ObjectDetectionResult* ground_truth_objects_; + ObjectDetectionResult predicted_objects_; + ObjectDetectionResult ground_truth_objects_; int current_image_index_ = 0; // One inner vector per class for ground truth objects. From 4f73ebfcffd4b2a59af9bc5a7660ef52e44a461d Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Sun, 21 Jul 2019 17:30:59 -0700 Subject: [PATCH 0258/3053] Update implementation_selector to work with graph generated by Distribution Strategy. 1. Added docstring to describe what actions are done when rewriting the graph. 2. Updated to use GraphView object to traverse the node/edge. 3. Added new rewrite step to handle identity node added by IsolatePlacerInspectionRequiredOps. PiperOrigin-RevId: 259237561 --- tensorflow/core/grappler/optimizers/BUILD | 1 + .../optimizers/implementation_selector.cc | 147 +++++++++++++++--- .../optimizers/implementation_selector.h | 3 +- 3 files changed, 132 insertions(+), 19 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD index 50036a56d1d..afc8c5f7b25 100644 --- a/tensorflow/core/grappler/optimizers/BUILD +++ b/tensorflow/core/grappler/optimizers/BUILD @@ -989,6 +989,7 @@ cc_library( "//tensorflow/core/grappler:grappler_item", "//tensorflow/core/grappler:op_types", "//tensorflow/core/grappler/costs:graph_properties", + "//tensorflow/core/grappler/utils:graph_view", "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/core/grappler/optimizers/implementation_selector.cc b/tensorflow/core/grappler/optimizers/implementation_selector.cc index 5bef9374c18..87acf85138f 100644 --- a/tensorflow/core/grappler/optimizers/implementation_selector.cc +++ b/tensorflow/core/grappler/optimizers/implementation_selector.cc @@ -26,6 +26,7 @@ limitations under the License. #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h" #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h" #include "tensorflow/core/grappler/optimizers/function_api_info.h" +#include "tensorflow/core/grappler/utils/graph_view.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/strings/strcat.h" @@ -34,15 +35,123 @@ limitations under the License. namespace tensorflow { namespace grappler { -Status UpdateNodeDef(NodeDef* node_def, const string& funcName, +// The overall idea for the function swap is like below: +// ----------- ----------- +// inp_1 ->| P_C | -> out_1 g_inp_1 ->| P_C | -> g_out_1 +// inp_2 ->| forward | -> out_2 g_inp_2 ->| backward| -> g_out_2 +// | FUNC_1 | -> out_3 g_inp_3 ->| FUNC_1 | +// ----------- ----------- +// | | | ^ ^ ^ +// v v v | | | +// s1 s2 s3 s1 s2 s3 +// | ^ +// | | +// | -------------- | +// |-----------> | Identity_1 | ---------->| +// -------------- +// P_C: op Partitioned_call or stateful_partitioned_call +// FUNC1 (forward): TF function generated for the forward path. +// FUNC1 (backward): TF function generated for the backward path. +// inp_x: input tensors for the forward path. +// out_x: output tensors for the forward path. 
+// g_inp_x: gradient input tensors for the backward path. +// g_out_x: gradient output tensors for the backward path. +// s_x: intermediate result generated by forward tf function, which will be +// consumed by backward function for gradient calculation. +// +// In the example above, the FUNC_1 takes 2 inputs and returns 3 outputs; in the +// meantime, it generates 3 intermediate results for gradient calculation. +// The backward function will take 6 inputs, 3 for the gradient value for out_x, +// and 3 for the intermediate results s1/2/3. It returns 2 outputs for gradient +// value wrt inp_x. +// +// Given the graph, especially after the device placement is done, we could +// check if there is an alternative FUNC_2 that is better for the assigned +// device type. Note that FUNC_2 (both forward and backward) should have the same +// number of input and output tensors with the same dtypes. However, it can generate +// different intermediate state tensors, both number-wise and type-wise, since it +// depends on the implementation detail. +// +// Also note that there might be some Identity op added to the output of +// the forward function by IsolatePlacerInspectionRequiredOps for device +// placement. When the output DTYPE changes when switching from FUNC_1 to +// FUNC_2, the Identity node downstream also needs to be updated with the new +// DTYPE. +// +// Based on this, the rewrite needs to happen for the following items: +// +// 1. P_C forward/backward need to use FUNC_2 instead of FUNC_1. +// 2. The T_IN for P_C backward needs to be updated since the s_x can be +// different between FUNC_1 and FUNC_2. +// 3. The T_OUT for P_C forward needs to be updated since the s_x can be +// different between FUNC_1 and FUNC_2. +// 4. The input edges for P_C backward need to be updated since the number of +// intermediate results can be different between FUNC_1 and FUNC_2. +// 5. The DTYPEs of the Identity nodes after s_1/2/3 need to be updated if they exist. + +string FindForwardNode(utils::MutableNodeView* backward_node) { + // For the tf function, an Identity op node might be added by + // placer_inspection_required_ops_utils for device placement. Those ops might + // be removed by model_pruner, or stay there if the Identity op is cross + // device. Given the partitioned_call node for the backward function, we want to + // find the partitioned_call node for the forward function, so that we can + // add/remove/update input tensors for the backward function, which is step + // 4 as described above. + + // Find the last input. + const int last_input_index = backward_node->NumRegularFanins() - 1; + const utils::MutableFanoutView& input = + backward_node->GetRegularFanin(last_input_index); + // For the input node, it should either be the partitioned call, which is + // the forward node we need, or an Identity op which just passes through the + // output of the partitioned call. + if (IsIdentity(*input.node_view()->node())) { + // Find the only input to this op, which should be the original forward node. + return input.node_view()->node()->input(0); + } else if (IsPartitionedCall(*input.node_view()->node()) || + IsStatefulPartitionedCall(*input.node_view()->node())) { + // Found the forward node. + return backward_node->node()->input(last_input_index); + } else { + // Unhandled situation.
+ return ""; + } +} + +void UpdateForwardIdentityNodeDtype(utils::MutableNodeView* forward_node, + const DataTypeVector& dtypes) { + const auto& fanouts_vector = forward_node->GetRegularFanouts(); + for (int pos = 0; pos < fanouts_vector.size(); ++pos) { + const auto& fanouts_at_pos = fanouts_vector[pos]; + for (const auto& fanout : fanouts_at_pos) { + if ("Identity" == fanout.node_view()->GetOp()) { + (*fanout.node_view()->node()->mutable_attr())["T"].set_type( + dtypes[pos]); + VLOG(3) << "Updated DTYPE for Identity node: " + << fanout.node_view()->node()->DebugString(); + } + } + } +} + +Status UpdateNodeDef(utils::MutableNodeView* node_view, const string& funcName, const FunctionApiInfo& apiInfo) { + NodeDef* node_def = node_view->node(); + VLOG(3) << "Node def before swap is: " << node_def->DebugString(); + + // For step 1 above. + node_def->mutable_attr()->find("f")->second.mutable_func()->set_name( + funcName); + + // For step 2 above. auto tin = node_def->mutable_attr()->find("Tin"); tin->second.mutable_list()->clear_type(); for (const auto& tin_dtype : apiInfo.input_arg_dtypes()) { tin->second.mutable_list()->add_type(tin_dtype); } + // For step 3 above. auto tout = node_def->mutable_attr()->find("Tout"); tout->second.mutable_list()->clear_type(); for (const auto& tout_dtype : apiInfo.output_arg_dtypes()) { @@ -50,14 +159,7 @@ Status UpdateNodeDef(NodeDef* node_def, const string& funcName, } if (apiInfo.function_type() == FunctionApiInfo::BACKWARD) { - // Update the inputs since for backward function, it might have different - // number of inputs due the different number output from forward function. - // The output of forward function are composed by two parts: - // 1. Real output tensors from defun. - // 2. Internal states that will be used for gradient calculation. - // Part 1 will be static, and part 2 could be different based on the - // different implementation. - + // For step 4 above. const int prev_input_size = node_def->input_size(); const int diff = prev_input_size - apiInfo.input_arg_dtypes().size(); if (diff >= 0) { @@ -75,7 +177,7 @@ Status UpdateNodeDef(NodeDef* node_def, const string& funcName, // input: "unified_lstm/StatefulPartitionedCall:4" // # New input should be "unified_lstm/StatefulPartitionedCall:5" // } - const string last_input = node_def->input(prev_input_size - 1); + const string last_input = FindForwardNode(node_view); const std::vector<string> name_index = ::absl::StrSplit(last_input, ':'); if (name_index.size() != 2) { return errors::InvalidArgument( @@ -92,23 +194,25 @@ Status UpdateNodeDef(NodeDef* node_def, const string& funcName, for (int i = 1; i <= -diff; ++i) node_def->add_input(strings::StrCat(node_name, ":", i + last_index)); } + } else if (apiInfo.function_type() == FunctionApiInfo::FORWARD) { + // For forward function, since the DTYPE of the intermediate state might + // have been changed, we want to update the downstream Identity node if + // any. This is step 5 in the comment above.
+ UpdateForwardIdentityNodeDtype(node_view, apiInfo.output_arg_dtypes()); } - node_def->mutable_attr()->find("f")->second.mutable_func()->set_name( - funcName); - VLOG(3) << "Node def after swap is: " << node_def->DebugString(); return Status::OK(); } Status ImplementationSelector::LoadFunctions(const GraphDef& graph) { - lib_info_.reset(new FunctionLibraryApiInfo); + lib_info_ = absl::make_unique(); TF_RETURN_IF_ERROR(lib_info_->Init(graph.library())); return Status::OK(); } Status ImplementationSelector::MaybeOptimizeFunctionCall( - NodeDef* node_def) const { + utils::MutableNodeView* node_view) const { // There are two ways of calling functions: // 1. By specifying an op name as a function name, or // 2. Via the @defun functional interface, where the real function call @@ -116,6 +220,8 @@ Status ImplementationSelector::MaybeOptimizeFunctionCall( // attribute with name "f" and type func. In this use case, there are more // attributes need to be taken care, like Tin and Tout which take care of // the DTYPE of input/output. + NodeDef* node_def = node_view->node(); + std::vector function_attribute_names; for (const auto& attr : node_def->attr()) { if (attr.second.has_func() && @@ -149,7 +255,7 @@ Status ImplementationSelector::MaybeOptimizeFunctionCall( const auto& func_api_info = lib_info_->GetApiInfo(func_name); if (func_api_info->preferred_device() == parsed_name.type) { VLOG(2) << "Swapping: " << function_name << " TO: " << func_name; - TF_RETURN_IF_ERROR(UpdateNodeDef(node_def, func_name, *func_api_info)); + TF_RETURN_IF_ERROR(UpdateNodeDef(node_view, func_name, *func_api_info)); break; } } @@ -181,8 +287,13 @@ Status ImplementationSelector::SelectImplementation(GraphDef* graph) const { return Status::OK(); } - for (int k = 0; k < graph->node_size(); ++k) - TF_RETURN_IF_ERROR(MaybeOptimizeFunctionCall(graph->mutable_node(k))); + Status status; + utils::MutableGraphView graph_view(graph, &status); + TF_RETURN_IF_ERROR(status); + + const int num_nodes = graph_view.NumNodes(); + for (int k = 0; k < num_nodes; ++k) + TF_RETURN_IF_ERROR(MaybeOptimizeFunctionCall(graph_view.GetNode(k))); return Status::OK(); } diff --git a/tensorflow/core/grappler/optimizers/implementation_selector.h b/tensorflow/core/grappler/optimizers/implementation_selector.h index c206d21640b..2fafe4ece12 100644 --- a/tensorflow/core/grappler/optimizers/implementation_selector.h +++ b/tensorflow/core/grappler/optimizers/implementation_selector.h @@ -25,6 +25,7 @@ limitations under the License. #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h" #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h" #include "tensorflow/core/grappler/optimizers/function_api_info.h" +#include "tensorflow/core/grappler/utils/graph_view.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/strings/strcat.h" @@ -88,7 +89,7 @@ class ImplementationSelector : public CustomGraphOptimizer { private: Status LoadFunctions(const GraphDef& graph); - Status MaybeOptimizeFunctionCall(NodeDef* node_def) const; + Status MaybeOptimizeFunctionCall(utils::MutableNodeView* node_view) const; // Finds all call sites for functions, then replace with the appropriate // implementation. From 96d0f42d1b236d21157d32805d4aa87e136083b3 Mon Sep 17 00:00:00 2001 From: Anjali Sridhar Date: Sun, 21 Jul 2019 19:35:10 -0700 Subject: [PATCH 0259/3053] Update API docs of ClusterResolver and all its implementations. 
PiperOrigin-RevId: 259246199 --- .../python/distribute/cluster_resolver/__init__.py | 9 ++++++++- .../distribute/cluster_resolver/cluster_resolver.py | 4 ++-- .../cluster_resolver/gce_cluster_resolver.py | 6 +++--- .../cluster_resolver/kubernetes_cluster_resolver.py | 4 ++-- .../cluster_resolver/slurm_cluster_resolver.py | 12 ++++++------ .../cluster_resolver/tfconfig_cluster_resolver.py | 7 ++++++- .../cluster_resolver/tpu_cluster_resolver.py | 4 ++-- 7 files changed, 29 insertions(+), 17 deletions(-) diff --git a/tensorflow/python/distribute/cluster_resolver/__init__.py b/tensorflow/python/distribute/cluster_resolver/__init__.py index 39ea191fb04..11de551b084 100644 --- a/tensorflow/python/distribute/cluster_resolver/__init__.py +++ b/tensorflow/python/distribute/cluster_resolver/__init__.py @@ -12,7 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Library Imports for Cluster Resolvers.""" +"""Library imports for ClusterResolvers. + + This library contains all implementations of ClusterResolvers. + ClusterResolvers are a way of specifying cluster information for distributed + execution. Built on top of existing `ClusterSpec` framework, ClusterResolvers + are a way for TensorFlow to communicate with various cluster management + systems (e.g. GCE, AWS, etc...). +""" from __future__ import absolute_import from __future__ import division diff --git a/tensorflow/python/distribute/cluster_resolver/cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/cluster_resolver.py index c636c98254c..5b61f847801 100644 --- a/tensorflow/python/distribute/cluster_resolver/cluster_resolver.py +++ b/tensorflow/python/distribute/cluster_resolver/cluster_resolver.py @@ -90,7 +90,7 @@ class ClusterResolver(object): @abc.abstractmethod def cluster_spec(self): - """Retrieve the current state of the cluster and returns a ClusterSpec. + """Retrieve the current state of the cluster and return a ClusterSpec. Returns: A ClusterSpec representing the state of the cluster at the moment this @@ -288,7 +288,7 @@ class UnionClusterResolver(ClusterResolver): when cluster_spec is called. The details of the merge function is documented in the cluster_spec function. - For additional Cluster Resolver properties such as task type, task index, + For additional ClusterResolver properties such as task type, task index, rpc layer, environment, etc..., we will return the value from the first ClusterResolver in the union. """ diff --git a/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver.py index 9d7dfdd1ea9..70d42e80a70 100644 --- a/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver.py +++ b/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of Cluster Resolvers for GCE Instance Groups.""" +"""Implementation of ClusterResolvers for GCE instance groups.""" from __future__ import absolute_import from __future__ import division @@ -33,12 +33,12 @@ except ImportError: @tf_export('distribute.cluster_resolver.GCEClusterResolver') class GCEClusterResolver(ClusterResolver): - """Cluster Resolver for Google Compute Engine. 
+ """ClusterResolver for Google Compute Engine. This is an implementation of cluster resolvers for the Google Compute Engine instance group platform. By specifying a project, zone, and instance group, this will retrieve the IP address of all the instances within the instance - group and return a Cluster Resolver object suitable for use for distributed + group and return a ClusterResolver object suitable for use for distributed TensorFlow. """ diff --git a/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver.py index 28b2712590d..f812df0e5c7 100644 --- a/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver.py +++ b/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver.py @@ -33,7 +33,7 @@ except ImportError: @tf_export('distribute.cluster_resolver.KubernetesClusterResolver') class KubernetesClusterResolver(ClusterResolver): - """Cluster Resolver for Kubernetes. + """ClusterResolver for Kubernetes. This is an implementation of cluster resolvers for Kubernetes. When given the the Kubernetes namespace and label selector for pods, we will retrieve the @@ -48,7 +48,7 @@ class KubernetesClusterResolver(ClusterResolver): override_client=None): """Initializes a new KubernetesClusterResolver. - This initializes a new Kubernetes Cluster Resolver. The Cluster Resolver + This initializes a new Kubernetes ClusterResolver. The ClusterResolver will attempt to talk to the Kubernetes master to retrieve all the instances of pods matching a label selector. diff --git a/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py index 0e49cebee2b..1d6d346ddf2 100644 --- a/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py +++ b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py @@ -30,13 +30,13 @@ from tensorflow.python.util.tf_export import tf_export @tf_export('distribute.cluster_resolver.SlurmClusterResolver') class SlurmClusterResolver(ClusterResolver): - """Cluster Resolver for system with Slurm workload manager. + """ClusterResolver for system with Slurm workload manager. This is an implementation of cluster resolvers for Slurm clusters. This allows the specification of jobs and task counts, number of tasks per node, number of - GPUs on each node and number of GPUs for each task, It retrieves system + GPUs on each node and number of GPUs for each task. It retrieves system attributes by Slurm environment variables, resolves allocated computing node - names, construct a cluster and return a Cluster Resolver object which an be + names, constructs a cluster and returns a ClusterResolver object which can be use for distributed TensorFlow. """ @@ -61,15 +61,15 @@ class SlurmClusterResolver(ClusterResolver): """Creates a new SlurmClusterResolver object. This takes in parameters and creates a SlurmClusterResolver object. It uses - those parameters to check which nodes will processes reside and resolves + those parameters to check which nodes will processes reside on and resolves their hostnames. With the number of the GPUs on each node and number of GPUs - for each task it offsets the port number for each processes and allocate + for each task it offsets the port number for each process and allocates GPUs to tasks by setting environment variables. The resolver currently supports homogeneous tasks and default Slurm process allocation. 
Args: jobs: Dictionary with job names as key and number of tasks in the job as - value + value. port_base: The first port number to start with for processes on a node. gpus_per_node: Number of GPUs available on each node. gpus_per_task: Number of GPUs to be used for each task. diff --git a/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py index c9b6191a1c0..421351944c2 100644 --- a/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py +++ b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py @@ -50,7 +50,12 @@ def _get_value_in_tfconfig(key, default=None): @tf_export('distribute.cluster_resolver.TFConfigClusterResolver') class TFConfigClusterResolver(ClusterResolver): - """Implementation of a ClusterResolver which reads the TF_CONFIG EnvVar.""" + """Implementation of a ClusterResolver which reads the TF_CONFIG EnvVar. + + This is an implementation of cluster resolvers when using TF_CONFIG to set + information about the cluster. The cluster spec returned will be + initialized from the TF_CONFIG environment variable. + """ def __init__(self, task_type=None, diff --git a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py index 253708c132c..757d2a47b64 100644 --- a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py +++ b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py @@ -94,7 +94,7 @@ class TPUClusterResolver(ClusterResolver): This works around an issue where the underlying HTTP connection sometimes times out when the script has been running for too long. Other methods in - this object calls this method to get a new API object whenever they need + this object call this method to get a new API object whenever they need to communicate with the Cloud API. Returns: @@ -206,7 +206,7 @@ class TPUClusterResolver(ClusterResolver): for the IP addresses and ports of each Cloud TPU listed. Args: - tpu: A string corresponding to the TPU to use. If the string is the empty + tpu: A string corresponding to the TPU to use. 
If the string is an empty string, the string 'local', or a string that begins with 'grpc://' or '/bns', then it is assumed to not correspond with a Cloud TPU and will instead be passed as the session master and no ClusterSpec propagation From 4390c4f8463bc5fb8e52fc2b4749951cdfca64ce Mon Sep 17 00:00:00 2001 From: amoitra Date: Sun, 21 Jul 2019 20:55:35 -0700 Subject: [PATCH 0260/3053] minor fix - missed something during merge conflict resolution --- tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc index ffda48872f2..25a821cb078 100755 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc @@ -561,7 +561,7 @@ StatusOr RunOnInstruction(HloInstruction* conv) { conv->feature_group_count(), conv->metadata()); } - std::tie(match, window, dnums) = MatchBackwardFilter(conv); + std::tie(match, window, dnums, lhs) = MatchBackwardFilter(conv); if (match) { return CreateCudnnConv(kCudnnConvBackwardFilterCallTarget, conv->shape(), lhs, conv->mutable_operand(1), window, dnums, From b4e562543795c5e48e8c751d795449a8621ac720 Mon Sep 17 00:00:00 2001 From: Chao Mei Date: Sun, 21 Jul 2019 21:01:16 -0700 Subject: [PATCH 0261/3053] Use a simpler external Cpu backend context class to replace the actual functionalities of the existing ref-counted cpu backend context class. PiperOrigin-RevId: 259252521 --- tensorflow/lite/BUILD | 11 ++ .../lite/external_cpu_backend_context.cc | 38 ++++++ .../lite/external_cpu_backend_context.h | 110 ++++++++++++++++++ tensorflow/lite/interpreter.cc | 26 +++++ tensorflow/lite/interpreter.h | 13 ++- tensorflow/lite/kernels/BUILD | 3 +- .../lite/kernels/cpu_backend_context.cc | 3 +- tensorflow/lite/kernels/cpu_backend_context.h | 13 +-- .../lite/kernels/cpu_backend_support.cc | 85 +++++--------- 9 files changed, 232 insertions(+), 70 deletions(-) create mode 100644 tensorflow/lite/external_cpu_backend_context.cc create mode 100644 tensorflow/lite/external_cpu_backend_context.h diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD index c5742adce6f..e97de3d0f2e 100644 --- a/tensorflow/lite/BUILD +++ b/tensorflow/lite/BUILD @@ -94,6 +94,16 @@ cc_library( deps = ["//tensorflow/lite/c:c_api_internal"], ) +cc_library( + name = "external_cpu_backend_context", + srcs = ["external_cpu_backend_context.cc"], + hdrs = ["external_cpu_backend_context.h"], + copts = TFLITE_DEFAULT_COPTS, + deps = [ + "//tensorflow/lite/c:c_api_internal", + ], +) + cc_library( name = "graph_info", hdrs = ["graph_info.h"], @@ -201,6 +211,7 @@ cc_library( deps = [ ":allocation", ":arena_planner", + ":external_cpu_backend_context", ":graph_info", ":memory_planner", ":minimal_logging", diff --git a/tensorflow/lite/external_cpu_backend_context.cc b/tensorflow/lite/external_cpu_backend_context.cc new file mode 100644 index 00000000000..2be35c8baf7 --- /dev/null +++ b/tensorflow/lite/external_cpu_backend_context.cc @@ -0,0 +1,38 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/lite/external_cpu_backend_context.h" + +namespace tflite { +namespace { + +TfLiteStatus RefreshExternalCpuBackendContext(TfLiteContext* context) { + auto* const external_context = static_cast<ExternalCpuBackendContext*>( + context->GetExternalContext(context, kTfLiteCpuBackendContext)); + if (external_context && external_context->internal_backend_context() && + context->recommended_num_threads != -1) { + external_context->internal_backend_context()->set_max_num_threads( + context->recommended_num_threads); + } + return kTfLiteOk; +} +} // namespace + +ExternalCpuBackendContext::ExternalCpuBackendContext() + : internal_backend_context_(nullptr) { + this->type = kTfLiteCpuBackendContext; + this->Refresh = RefreshExternalCpuBackendContext; +} + +} // namespace tflite diff --git a/tensorflow/lite/external_cpu_backend_context.h b/tensorflow/lite/external_cpu_backend_context.h new file mode 100644 index 00000000000..0d8763532c7 --- /dev/null +++ b/tensorflow/lite/external_cpu_backend_context.h @@ -0,0 +1,110 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_EXTERNAL_CPU_BACKEND_CONTEXT_H_ +#define TENSORFLOW_LITE_EXTERNAL_CPU_BACKEND_CONTEXT_H_ + +#include <memory> +#include <utility> + +#include "tensorflow/lite/c/c_api_internal.h" + +namespace tflite { + +// This is the base class for TF Lite internal backend contexts (like a +// RUY-based cpu backend context class). A derived internal backend context is +// generally a collection of utilities (i.e. a thread pool etc.) for TF Lite to +// use certain kernel libraries, such as Gemmlowp, RUY, etc., to implement TF +// Lite operators. +// TODO(b/130950871): Make this class an interface-only abstract class. +class TfLiteInternalBackendContext { + public: + virtual ~TfLiteInternalBackendContext() {} + + int max_num_threads() const { return max_num_threads_; } + + virtual void set_max_num_threads(int max_num_threads) { + max_num_threads_ = max_num_threads; + } + + protected: + TfLiteInternalBackendContext() {} + + // The maximum number of threads used for parallelizing TfLite computation. + int max_num_threads_; + + private: + TfLiteInternalBackendContext(const TfLiteInternalBackendContext&) = delete; + TfLiteInternalBackendContext& operator=(const TfLiteInternalBackendContext&) = + delete; +}; + +// This TfLiteExternalContext-derived class is the default +// 'kTfLiteCpuBackendContext'-typed context that's used internally in TF Lite +// framework.
The primary purpose of having this class is to allow the same cpu +// backend context to be sharable among a set of TF Lite interpreters so that +// certain system costs are saved, like saving the cost of having multiple +// thread pools in each separate cpu backend context etc.. +// +// Note: as of 2019/07/19, such context sharing among a set of interpreters will +// break the execution if these interpreters are invoked simultaneously. It +// works only when these context-sharing interpreters are invoked in a +// serialized way. Here's an example to illustrate the context sharing among 2 +// TF Lite interpreters: +// +// TfLiteInternalBackendContext* global_ctxt = new ExternalCpuBackendContext(); +// interpreter1 = /*...*/; +// interpreter1->SetExternalContext(kTfLiteCpuBackendContext, global_ctxt); +// interpreter2 = /*...*/; +// interpreter2->SetExternalContext(kTfLiteCpuBackendContext, global_ctxt); +// +// interpreter1->SetNumThreads(2); +// interpreter1->Invoke(); +// +// interpreter2->SetNumThreads(4); +// interpreter2->Invoke(); +// +// After sharing the context, calling 'SetNumThreads' on any of the +// context-sharing interpreters will have the global impact as it also refreshes +// the #thread info in the global cpu backend context (i.e. 'global_ctxt' above) +// that affects how much parallelism an interpreter invocation will use. +// Therefore, if different number of threads are used among different +// interpreters, don't call 'SetNumThreads' consectutively but call it +// separately between each interpreter's invocation as illustrated above. +class ExternalCpuBackendContext : public TfLiteExternalContext { + public: + ExternalCpuBackendContext(); + ~ExternalCpuBackendContext() {} + + void set_internal_backend_context( + std::unique_ptr internal_backend_context) { + internal_backend_context_ = std::move(internal_backend_context); + } + + TfLiteInternalBackendContext* internal_backend_context() const { + return internal_backend_context_.get(); + } + + private: + // Note the actual internal backend context object is lazily initialized. + std::unique_ptr internal_backend_context_; + + ExternalCpuBackendContext(const ExternalCpuBackendContext&) = delete; + ExternalCpuBackendContext& operator=(const ExternalCpuBackendContext&) = + delete; +}; + +} // namespace tflite + +#endif // TENSORFLOW_LITE_EXTERNAL_CPU_BACKEND_CONTEXT_H_ diff --git a/tensorflow/lite/interpreter.cc b/tensorflow/lite/interpreter.cc index 99d117591fd..bf72f7822ad 100644 --- a/tensorflow/lite/interpreter.cc +++ b/tensorflow/lite/interpreter.cc @@ -71,6 +71,12 @@ Interpreter::Interpreter(ErrorReporter* error_reporter) external_contexts_[i] = nullptr; } + // This operation is cheap because we allocate the CPU context resources (i.e. + // threads) lazily. + own_external_cpu_backend_context_.reset(new ExternalCpuBackendContext()); + external_contexts_[kTfLiteCpuBackendContext] = + own_external_cpu_backend_context_.get(); + UseNNAPI(false); } @@ -78,6 +84,26 @@ Interpreter::~Interpreter() {} void Interpreter::SetExternalContext(TfLiteExternalContextType type, TfLiteExternalContext* ctx) { + if (ctx == own_external_cpu_backend_context_.get()) { + error_reporter_->Report( + "WARNING: The passed external context is identical to the internally " + "owned one."); + return; + } + + // We have an internally owned external context of kTfLiteCpuBackendContext. + // If it's overwritten here, we will release the resource of the internally + // owned external context. 
+ // Note: the 'max thread count' info associated with the overwritten context + // will be lost here, and such info is now detemined by the new context, thus + // affecting how much parallelism a TFLite op would have. + if (kTfLiteCpuBackendContext == type && + external_contexts_[kTfLiteCpuBackendContext] == + own_external_cpu_backend_context_.get()) { + own_external_cpu_backend_context_.reset(); + } + + // This essentially changes the "external_contexts_[type]". primary_subgraph().SetExternalContext(type, ctx); } diff --git a/tensorflow/lite/interpreter.h b/tensorflow/lite/interpreter.h index b1353175530..8eef58530e2 100644 --- a/tensorflow/lite/interpreter.h +++ b/tensorflow/lite/interpreter.h @@ -20,6 +20,7 @@ limitations under the License. #include #include #include +#include #include #include "tensorflow/lite/allocation.h" @@ -27,6 +28,7 @@ limitations under the License. #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/core/api/profiler.h" #include "tensorflow/lite/core/subgraph.h" +#include "tensorflow/lite/external_cpu_backend_context.h" #include "tensorflow/lite/memory_planner.h" #include "tensorflow/lite/stderr_reporter.h" @@ -460,7 +462,9 @@ class Interpreter { return op_reg.profiling_string(context_, node); } - /// Set the value of an external context. + // Set the value of an external context. TFLite interpreter doesn't take the + // memory ownership of this external context 'ctx', and the context should + // outlive the TFLite interpreter. void SetExternalContext(TfLiteExternalContextType type, TfLiteExternalContext* ctx); @@ -526,6 +530,13 @@ class Interpreter { // List of active external contexts. TfLiteExternalContext* external_contexts_[kTfLiteMaxExternalContexts]; + // The default external cpu backend context. After an TFLite interpreter is + // initialized, 'external_contexts_[kTfLiteCpuBackendContext]' is set to point + // to this object. However, if this element value is overwritten via calling + // 'SetExternalContext(kTfLiteCpuBackendContext, ...)', we will reset this to + // nullptr if necessary. + std::unique_ptr own_external_cpu_backend_context_; + // Subgraphs std::vector> subgraphs_; }; diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD index a75404eb276..ee9090902ce 100644 --- a/tensorflow/lite/kernels/BUILD +++ b/tensorflow/lite/kernels/BUILD @@ -227,6 +227,7 @@ cc_library( # gemmlowp_context_ and ruy_context_ members. "//tensorflow/lite/experimental/ruy:context", "@gemmlowp", + "//tensorflow/lite:external_cpu_backend_context", ], ) @@ -319,8 +320,8 @@ cc_library( deps = [ ":cpu_backend_context", ":op_macros", + "//tensorflow/lite:external_cpu_backend_context", "//tensorflow/lite/c:c_api_internal", - "@gemmlowp", ], ) diff --git a/tensorflow/lite/kernels/cpu_backend_context.cc b/tensorflow/lite/kernels/cpu_backend_context.cc index 15ab1bc7a67..f9a1ee0a86b 100644 --- a/tensorflow/lite/kernels/cpu_backend_context.cc +++ b/tensorflow/lite/kernels/cpu_backend_context.cc @@ -21,7 +21,8 @@ limitations under the License. 
namespace tflite { CpuBackendContext::CpuBackendContext() - : ruy_context_(new ruy::Context), + : TfLiteInternalBackendContext(), + ruy_context_(new ruy::Context), gemmlowp_context_(new gemmlowp::GemmContext) { set_max_num_threads(1); } diff --git a/tensorflow/lite/kernels/cpu_backend_context.h b/tensorflow/lite/kernels/cpu_backend_context.h index 066d4a10b8d..00b12d8ba54 100644 --- a/tensorflow/lite/kernels/cpu_backend_context.h +++ b/tensorflow/lite/kernels/cpu_backend_context.h @@ -20,13 +20,14 @@ limitations under the License. #include "public/gemmlowp.h" #include "tensorflow/lite/experimental/ruy/context.h" +#include "tensorflow/lite/external_cpu_backend_context.h" namespace tflite { -class CpuBackendContext final { +class CpuBackendContext final : public TfLiteInternalBackendContext { public: CpuBackendContext(); - ~CpuBackendContext(); + ~CpuBackendContext() override; ruy::Context* ruy_context() const { return ruy_context_.get(); } @@ -44,10 +45,7 @@ class CpuBackendContext final { // // This value also gets propagated to back-ends, where it plays the same // information-only role. - void set_max_num_threads(int max_num_threads); - - // See set_max_num_threads. - int max_num_threads() const { return max_num_threads_; } + void set_max_num_threads(int max_num_threads) override; private: // To enable a smooth transition from the current direct usage @@ -59,9 +57,6 @@ class CpuBackendContext final { const std::unique_ptr ruy_context_; const std::unique_ptr gemmlowp_context_; - // See set_max_num_threads. - int max_num_threads_; - CpuBackendContext(const CpuBackendContext&) = delete; }; diff --git a/tensorflow/lite/kernels/cpu_backend_support.cc b/tensorflow/lite/kernels/cpu_backend_support.cc index 5d7f41ab4e8..64a41b2e1ec 100644 --- a/tensorflow/lite/kernels/cpu_backend_support.cc +++ b/tensorflow/lite/kernels/cpu_backend_support.cc @@ -17,74 +17,43 @@ limitations under the License. #include #include "tensorflow/lite/c/c_api_internal.h" +#include "tensorflow/lite/external_cpu_backend_context.h" #include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/op_macros.h" namespace tflite { namespace cpu_backend_support { -namespace { - -// TODO(b/130950871) we probably shouldn't be using any reference-counting -// but this is an existing idiom. 
-struct RefCountedCpuBackendContext : public TfLiteExternalContext { - std::unique_ptr<CpuBackendContext> cpu_backend_context; - int num_references = 0; -}; - -RefCountedCpuBackendContext* GetCpuBackendContext(TfLiteContext* context) { - return static_cast<RefCountedCpuBackendContext*>( - context->GetExternalContext(context, kTfLiteCpuBackendContext)); -} - -TfLiteStatus Refresh(TfLiteContext* context) { - auto* refcounted = GetCpuBackendContext(context); - if (refcounted != nullptr) { - refcounted->cpu_backend_context->set_max_num_threads( - context->recommended_num_threads); - } - return kTfLiteOk; -} - -} // namespace - -void IncrementUsageCounter(TfLiteContext* context) { - RefCountedCpuBackendContext* refcounted = GetCpuBackendContext(context); - if (refcounted == nullptr) { - refcounted = new RefCountedCpuBackendContext; - refcounted->type = kTfLiteCpuBackendContext; - refcounted->Refresh = Refresh; - refcounted->cpu_backend_context.reset(new CpuBackendContext); - if (context->recommended_num_threads != -1) { - refcounted->cpu_backend_context->set_max_num_threads( - context->recommended_num_threads); - } - refcounted->num_references = 0; - context->SetExternalContext(context, kTfLiteCpuBackendContext, refcounted); - } - refcounted->num_references++; -} - -void DecrementUsageCounter(TfLiteContext* context) { - RefCountedCpuBackendContext* refcounted = GetCpuBackendContext(context); - if (refcounted == nullptr) { - TF_LITE_FATAL( - "Call to DecrementUsageCounter() not preceded by " - "IncrementUsageCounter()"); - } - if (--refcounted->num_references == 0) { - delete refcounted; - context->SetExternalContext(context, kTfLiteCpuBackendContext, nullptr); - } -} +// TODO(b/130950871): Remove all references to the following two no-op functions +// once the new ExternalCpuBackendContext class is checked in. +void IncrementUsageCounter(TfLiteContext* context) {} +void DecrementUsageCounter(TfLiteContext* context) {} CpuBackendContext* GetFromContext(TfLiteContext* context) { - RefCountedCpuBackendContext* refcounted = GetCpuBackendContext(context); - if (refcounted == nullptr) { + auto* external_context = static_cast<ExternalCpuBackendContext*>( + context->GetExternalContext(context, kTfLiteCpuBackendContext)); + + if (external_context == nullptr) { TF_LITE_FATAL( - "Call to GetFromContext() not preceded by IncrementUsageCounter()"); + "ExternalCpuBackendContext isn't properly initialized during TFLite " + "interpreter initialization."); } - return refcounted->cpu_backend_context.get(); + + auto* cpu_backend_context = static_cast<CpuBackendContext*>( + external_context->internal_backend_context()); + if (cpu_backend_context == nullptr) { + // We do the lazy initialization here for the TfLiteInternalBackendContext + // that's wrapped inside ExternalCpuBackendContext.
+ cpu_backend_context = new CpuBackendContext(); + if (context->recommended_num_threads != -1) { + cpu_backend_context->set_max_num_threads( + context->recommended_num_threads); + } + external_context->set_internal_backend_context( + std::unique_ptr(cpu_backend_context)); + } + + return cpu_backend_context; } } // namespace cpu_backend_support From c97d30d0041ad139ebe372a71bce54e81526be45 Mon Sep 17 00:00:00 2001 From: Mei Jie <535370561@qq.com> Date: Mon, 22 Jul 2019 15:32:38 +0800 Subject: [PATCH 0262/3053] Update metric_ops.py for incorrect docstring #30848 --- tensorflow/contrib/metrics/python/ops/metric_ops.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py index eae04c7ba3e..b3f4d8c40c1 100644 --- a/tensorflow/contrib/metrics/python/ops/metric_ops.py +++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py @@ -1161,8 +1161,9 @@ def streaming_dynamic_auc(labels, and performing the final calculation using all of the concatenated values. Args: - labels: A `Tensor` of ground truth labels with the same shape as `labels` - and with values of 0 or 1 whose values are castable to `int64`. + labels: A `Tensor` of ground truth labels with the same shape as + `predictions` and with values of 0 or 1 whose values are castable to + `int64`. predictions: A `Tensor` of predictions whose values are castable to `float64`. Will be flattened into a 1-D `Tensor`. curve: The name of the curve for which to compute AUC, 'ROC' for the From 5471b5f66ed10ef49bce250746e7e73ec0ccf2be Mon Sep 17 00:00:00 2001 From: amoitra Date: Mon, 22 Jul 2019 00:33:24 -0700 Subject: [PATCH 0263/3053] Few more changes --- .../xla/service/gpu/cudnn_conv_rewriter.cc | 30 ++++++++++++++----- 1 file changed, 22 insertions(+), 8 deletions(-) mode change 100755 => 100644 tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc old mode 100755 new mode 100644 index 25a821cb078..a441e70510a --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc @@ -278,26 +278,40 @@ MatchBackwardFilter(HloInstruction* conv) { reshape_dims.insert(reshape_dims.begin() + input_batch_dimension, num_groups); HloComputation* c = conv->parent(); - lhs = c->AddInstruction(HloInstruction::CreateReshape( - ShapeUtil::MakeShape(lhs->shape().element_type(), reshape_dims), lhs)); + HloInstruction* lhs_reshape_1 = + c->AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(lhs->shape().element_type(), reshape_dims), + lhs)); // Transpose G to the axis before C/G, For eg: [G, N, C/G, H, W] -> [N, G, // C/G, H, W] - std::vector transpose_dims(lhs->shape().dimensions_size()); + std::vector transpose_dims(lhs_reshape_1->shape().dimensions_size()); std::iota(transpose_dims.begin(), transpose_dims.end(), 0); transpose_dims.erase(transpose_dims.begin() + input_batch_dimension); transpose_dims.insert(transpose_dims.begin() + input_feature_dimension, input_batch_dimension); - lhs = c->AddInstruction( - HloInstruction::CreateTranspose(lhs->shape(), lhs, transpose_dims)); + std::vector transpose_reshape_dims = + lhs_reshape_1->shape().dimensions(); + transpose_reshape_dims.erase(transpose_reshape_dims.begin() + + input_batch_dimension); + transpose_reshape_dims.insert( + transpose_reshape_dims.begin() + input_feature_dimension, 
num_groups); + + HloInstruction* lhs_transpose = + c->AddInstruction(HloInstruction::CreateTranspose( + ShapeUtil::MakeShape(lhs_reshape_1->shape().element_type(), + transpose_reshape_dims), + lhs_reshape_1, transpose_dims)); // Merge [G,C/G] -> [C] - Shape new_shape = lhs->shape(); + Shape new_shape = lhs_transpose->shape(); new_shape.DeleteDimension(input_feature_dimension); new_shape.set_dimensions(input_feature_dimension, input_feature * conv->feature_group_count()); - lhs = c->AddInstruction(HloInstruction::CreateReshape(new_shape, lhs)); - return std::make_tuple(true, backward_conv_window, backward_conv_dnums, lhs); + HloInstruction* lhs_reshape_2 = c->AddInstruction( + HloInstruction::CreateReshape(new_shape, lhs_transpose)); + return std::make_tuple(true, backward_conv_window, backward_conv_dnums, + lhs_reshape_2); } // Try to match a backward input pattern that contains "conv". From 04491137a7df50d6ac8b116c1f9eca4b479deee9 Mon Sep 17 00:00:00 2001 From: Amit Srivastava Date: Tue, 9 Jul 2019 08:41:29 +0530 Subject: [PATCH 0264/3053] Removed Depricated API from the file. --- tensorflow/contrib/distributions/python/ops/inverse_gamma.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py index 9f1e9d5cd1b..d7c1de10a42 100644 --- a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py +++ b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py @@ -236,7 +236,7 @@ class InverseGamma(distribution.Distribution): self.batch_shape_tensor(), np.array(np.nan, dtype=self.dtype.as_numpy_dtype()), name="nan") - return array_ops.where(self.concentration > 1., mean, nan) + return array_ops.where_v2(self.concentration > 1., mean, nan) else: return control_flow_ops.with_dependencies([ check_ops.assert_less( @@ -257,7 +257,7 @@ class InverseGamma(distribution.Distribution): self.batch_shape_tensor(), np.array(np.nan, dtype=self.dtype.as_numpy_dtype()), name="nan") - return array_ops.where(self.concentration > 2., var, nan) + return array_ops.where_v2(self.concentration > 2., var, nan) else: return control_flow_ops.with_dependencies([ check_ops.assert_less( From eff041ae16d32c960a8d5c52b54277564c823ca4 Mon Sep 17 00:00:00 2001 From: Amit Srivastava Date: Tue, 9 Jul 2019 08:43:55 +0530 Subject: [PATCH 0265/3053] Removed Depricated API from the file. --- tensorflow/contrib/distributions/python/ops/kumaraswamy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/distributions/python/ops/kumaraswamy.py b/tensorflow/contrib/distributions/python/ops/kumaraswamy.py index e3712dd84e3..56f35c28b1b 100644 --- a/tensorflow/contrib/distributions/python/ops/kumaraswamy.py +++ b/tensorflow/contrib/distributions/python/ops/kumaraswamy.py @@ -235,7 +235,7 @@ class Kumaraswamy(transformed_distribution.TransformedDistribution): np.array(np.nan, dtype=self.dtype.as_numpy_dtype), name="nan") is_defined = (self.concentration1 > 1.) & (self.concentration0 > 1.) - return array_ops.where(is_defined, mode, nan) + return array_ops.where_v2(is_defined, mode, nan) return control_flow_ops.with_dependencies([ check_ops.assert_less( From 6334b2c65bc0cf466310b4fbbe04c46e74282a05 Mon Sep 17 00:00:00 2001 From: Amit Srivastava Date: Tue, 9 Jul 2019 08:47:49 +0530 Subject: [PATCH 0266/3053] Removed Depricated API from the file. 
--- tensorflow/contrib/distributions/python/ops/binomial.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/contrib/distributions/python/ops/binomial.py b/tensorflow/contrib/distributions/python/ops/binomial.py index b349e5966dd..2cd80507c88 100644 --- a/tensorflow/contrib/distributions/python/ops/binomial.py +++ b/tensorflow/contrib/distributions/python/ops/binomial.py @@ -68,9 +68,9 @@ def _bdtr(k, n, p): # where(unsafe, safe_output, betainc(where(unsafe, safe_input, input))) ones = array_ops.ones_like(n - k) k_eq_n = math_ops.equal(k, n) - safe_dn = array_ops.where(k_eq_n, ones, n - k) + safe_dn = array_ops.where_v2(k_eq_n, ones, n - k) dk = math_ops.betainc(a=safe_dn, b=k + 1, x=1 - p) - return array_ops.where(k_eq_n, ones, dk) + return array_ops.where_v2(k_eq_n, ones, dk) class Binomial(distribution.Distribution): From fd4c0c0a784febc1b292c0b254638ff2e98975ce Mon Sep 17 00:00:00 2001 From: Amit Srivastava Date: Tue, 9 Jul 2019 08:51:37 +0530 Subject: [PATCH 0267/3053] Removed Depricated API from the file. --- tensorflow/contrib/distributions/python/ops/wishart.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/distributions/python/ops/wishart.py b/tensorflow/contrib/distributions/python/ops/wishart.py index a5bb880bed9..56c9704b8df 100644 --- a/tensorflow/contrib/distributions/python/ops/wishart.py +++ b/tensorflow/contrib/distributions/python/ops/wishart.py @@ -400,7 +400,7 @@ class _WishartLinearOperator(distribution.Distribution): def _mode(self): s = self.df - self.dimension - 1. - s = array_ops.where( + s = array_ops.where_v2( math_ops.less(s, 0.), constant_op.constant(float("NaN"), dtype=self.dtype, name="nan"), s) From a962dc7d2fddac2632acf624b86e2792f6d59dde Mon Sep 17 00:00:00 2001 From: Amit Srivastava Date: Tue, 9 Jul 2019 08:54:36 +0530 Subject: [PATCH 0268/3053] Removed Depricated API from the file. --- tensorflow/contrib/distributions/python/ops/batch_reshape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/distributions/python/ops/batch_reshape.py b/tensorflow/contrib/distributions/python/ops/batch_reshape.py index d4503790888..eb4b96835d2 100644 --- a/tensorflow/contrib/distributions/python/ops/batch_reshape.py +++ b/tensorflow/contrib/distributions/python/ops/batch_reshape.py @@ -381,7 +381,7 @@ def calculate_reshape(original_shape, new_shape, validate=False, name=None): size_implicit_dim = ( original_size // math_ops.maximum(1, -math_ops.reduce_prod(new_shape))) new_ndims = array_ops.shape(new_shape) - expanded_new_shape = array_ops.where( # Assumes exactly one `-1`. + expanded_new_shape = array_ops.where_v2( # Assumes exactly one `-1`. implicit_dim, array_ops.fill(new_ndims, size_implicit_dim), new_shape) validations = [] if not validate else [ check_ops.assert_rank( From e0ad74f36be9af82b98da1c886171ff4f62dc0ed Mon Sep 17 00:00:00 2001 From: Amit Srivastava Date: Tue, 9 Jul 2019 08:57:06 +0530 Subject: [PATCH 0269/3053] Removed Depricated API from the file. 
--- .../contrib/distributions/python/ops/negative_binomial.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/distributions/python/ops/negative_binomial.py b/tensorflow/contrib/distributions/python/ops/negative_binomial.py index 6acfc5746a0..229603c38a8 100644 --- a/tensorflow/contrib/distributions/python/ops/negative_binomial.py +++ b/tensorflow/contrib/distributions/python/ops/negative_binomial.py @@ -190,7 +190,7 @@ class NegativeBinomial(distribution.Distribution): return self.total_count * math_ops.exp(self.logits) def _mode(self): - adjusted_count = array_ops.where( + adjusted_count = array_ops.where_v2( 1. < self.total_count, self.total_count - 1., array_ops.zeros_like(self.total_count)) From 276f7fee2ce9bc635fdd910143d6f1bc5a12c943 Mon Sep 17 00:00:00 2001 From: Amit Srivastava Date: Tue, 9 Jul 2019 08:58:57 +0530 Subject: [PATCH 0270/3053] Removed Depricated API from the file. --- .../contrib/distributions/python/ops/bijectors/sinh_arcsinh.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py b/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py index 241fba2cb7e..aee3a603d2b 100644 --- a/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py +++ b/tensorflow/contrib/distributions/python/ops/bijectors/sinh_arcsinh.py @@ -43,7 +43,7 @@ __all__ = [ warn_once=True) def _sqrtx2p1(x): """Implementation of `sqrt(1 + x**2)` which is stable despite large `x`.""" - return array_ops.where( + return array_ops.where_v2( math_ops.abs(x) * np.sqrt(np.finfo(x.dtype.as_numpy_dtype).eps) <= 1., math_ops.sqrt(x**2. + 1.), # For large x, calculating x**2 can overflow. This can be alleviated by From 17b7d69ad4bfe3e51c4cee2a10fa24bd9048ec27 Mon Sep 17 00:00:00 2001 From: Amit Srivastava Date: Tue, 9 Jul 2019 09:03:24 +0530 Subject: [PATCH 0271/3053] Removed Depricated API from the file. --- .../contrib/distributions/python/ops/vector_diffeomixture.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py index f9748466c2e..b39dba7db6a 100644 --- a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py +++ b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py @@ -1060,5 +1060,5 @@ def softmax(x, axis, name=None): if axis_ is not None: axis = np.int(ndims + axis_ if axis_ < 0 else axis_) else: - axis = array_ops.where(axis < 0, ndims + axis, axis) + axis = array_ops.where_v2(axis < 0, ndims + axis, axis) return nn_ops.softmax(x, axis=axis) From 43813c00a1f93db1a0fa91278330e1ceaa990535 Mon Sep 17 00:00:00 2001 From: Amit Srivastava Date: Tue, 9 Jul 2019 09:05:33 +0530 Subject: [PATCH 0272/3053] Removed Depricated API from the file. --- tensorflow/contrib/distributions/python/ops/shape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/distributions/python/ops/shape.py b/tensorflow/contrib/distributions/python/ops/shape.py index 19d88d5ab5d..58f09094db3 100644 --- a/tensorflow/contrib/distributions/python/ops/shape.py +++ b/tensorflow/contrib/distributions/python/ops/shape.py @@ -457,7 +457,7 @@ class _DistributionShape(object): batch_shape = s[1:1+self.batch_ndims] # Since sample_dims=1 and is left-most, we add 1 to the number of # batch_ndims to get the event start dim. 
- event_start = array_ops.where( + event_start = array_ops.where_v2( math_ops.logical_and(expand_batch_dim, self._batch_ndims_is_0), 2, 1 + self.batch_ndims) event_shape = s[event_start:event_start+self.event_ndims] From 17005efa46f744be1cd1521f07e0cb70f65ae0c7 Mon Sep 17 00:00:00 2001 From: Amit Srivastava Date: Tue, 9 Jul 2019 09:08:00 +0530 Subject: [PATCH 0273/3053] Removed Depricated API from the file. --- .../contrib/distributions/python/ops/distribution_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/distributions/python/ops/distribution_util.py b/tensorflow/contrib/distributions/python/ops/distribution_util.py index 85692d271b6..b27193b1b27 100644 --- a/tensorflow/contrib/distributions/python/ops/distribution_util.py +++ b/tensorflow/contrib/distributions/python/ops/distribution_util.py @@ -475,7 +475,7 @@ def pad_mixture_dimensions(x, mixture_distribution, categorical_distribution, return array_ops.shape(d.batch_shape_tensor())[0] dist_batch_ndims = _get_ndims(mixture_distribution) cat_batch_ndims = _get_ndims(categorical_distribution) - pad_ndims = array_ops.where( + pad_ndims = array_ops.where_v2( categorical_distribution.is_scalar_batch(), dist_batch_ndims, dist_batch_ndims - cat_batch_ndims) From 6bd476cf7ea8b2b8ed632512541d7437af474545 Mon Sep 17 00:00:00 2001 From: Amit Srivastava Date: Tue, 9 Jul 2019 09:15:30 +0530 Subject: [PATCH 0274/3053] Removed Depricated API from the file. --- tensorflow/contrib/image/python/ops/image_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/image/python/ops/image_ops.py b/tensorflow/contrib/image/python/ops/image_ops.py index 05ba9155c40..96f6af2ac51 100644 --- a/tensorflow/contrib/image/python/ops/image_ops.py +++ b/tensorflow/contrib/image/python/ops/image_ops.py @@ -506,7 +506,7 @@ def connected_components(images): # constructing multiple additional large tensors. components_flat = array_ops.reshape(components, [-1]) unique_ids, id_index = array_ops.unique(components_flat) - id_is_zero = array_ops.where(math_ops.equal(unique_ids, 0))[:, 0] + id_is_zero = array_ops.where_v2(math_ops.equal(unique_ids, 0))[:, 0] # Map each nonzero id to consecutive values. nonzero_consecutive_ids = math_ops.range( array_ops.shape(unique_ids)[0] - array_ops.shape(id_is_zero)[0]) + 1 From 7b78999164cad53f797aa3043c469a1fa676ebea Mon Sep 17 00:00:00 2001 From: Amit Srivastava Date: Tue, 9 Jul 2019 09:20:30 +0530 Subject: [PATCH 0275/3053] Removed Depricated API from the file. 
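Besides the `where_v2` switch, it helps to spell out what the sharding helper below computes: the first `extras = size % num_shards` shards receive one extra id each, and `new_ids` renumbers ids within their shard. A quick NumPy check of the arithmetic with toy sizes (not part of the patch):

```python
import numpy as np

size, num_shards = 10, 3
ids = np.arange(size)
ids_per_shard = size // num_shards        # 3
extras = size % num_shards                # 1 -> first shard gets one extra id
assignments = np.maximum(ids // (ids_per_shard + 1),
                         (ids - extras) // ids_per_shard)
new_ids = np.where(assignments < extras,
                   ids % (ids_per_shard + 1),
                   (ids - extras) % ids_per_shard)
print(assignments)  # [0 0 0 0 1 1 1 2 2 2]
print(new_ids)      # [0 1 2 3 0 1 2 0 1 2]
```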
--- .../contrib/factorization/python/ops/factorization_ops.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/contrib/factorization/python/ops/factorization_ops.py b/tensorflow/contrib/factorization/python/ops/factorization_ops.py index 5c55f7f597b..7e06084b752 100644 --- a/tensorflow/contrib/factorization/python/ops/factorization_ops.py +++ b/tensorflow/contrib/factorization/python/ops/factorization_ops.py @@ -641,9 +641,9 @@ class WALSModel(object): extras = size % num_shards assignments = math_ops.maximum(ids // (ids_per_shard + 1), (ids - extras) // ids_per_shard) - new_ids = array_ops.where(assignments < extras, - ids % (ids_per_shard + 1), - (ids - extras) % ids_per_shard) + new_ids = array_ops.where_v2(assignments < extras, + ids % (ids_per_shard + 1), + (ids - extras) % ids_per_shard) return assignments, new_ids return func From a8008e160614fcf7052bf5562ec80007eb97e639 Mon Sep 17 00:00:00 2001 From: Chao Mei Date: Mon, 22 Jul 2019 01:41:01 -0700 Subject: [PATCH 0276/3053] Cleanup: changed the naming of member variables (i.e. adding "_" suffix) to be consistent. PiperOrigin-RevId: 259279731 --- .../lite/tools/benchmark/benchmark_test.cc | 2 +- .../tools/benchmark/benchmark_tflite_model.cc | 74 +++++++++---------- .../tools/benchmark/benchmark_tflite_model.h | 6 +- 3 files changed, 41 insertions(+), 41 deletions(-) diff --git a/tensorflow/lite/tools/benchmark/benchmark_test.cc b/tensorflow/lite/tools/benchmark/benchmark_test.cc index 563bf9e6eef..5d94d86d855 100644 --- a/tensorflow/lite/tools/benchmark/benchmark_test.cc +++ b/tensorflow/lite/tools/benchmark/benchmark_test.cc @@ -61,7 +61,7 @@ class TestBenchmark : public BenchmarkTfLiteModel { public: explicit TestBenchmark(BenchmarkParams params) : BenchmarkTfLiteModel(std::move(params)) {} - const tflite::Interpreter* GetInterpreter() { return interpreter.get(); } + const tflite::Interpreter* GetInterpreter() { return interpreter_.get(); } void Prepare() { PrepareInputData(); diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc index e527796664f..0035a0b4373 100644 --- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc +++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc @@ -318,27 +318,27 @@ bool BenchmarkTfLiteModel::ValidateParams() { } return PopulateInputLayerInfo(params_.Get("input_layer"), params_.Get("input_layer_shape"), - &inputs); + &inputs_); } uint64_t BenchmarkTfLiteModel::ComputeInputBytes() { - TFLITE_BENCHMARK_CHECK(interpreter); + TFLITE_BENCHMARK_CHECK(interpreter_); uint64_t total_input_bytes = 0; - for (int input : interpreter->inputs()) { - auto* t = interpreter->tensor(input); + for (int input : interpreter_->inputs()) { + auto* t = interpreter_->tensor(input); total_input_bytes += t->bytes; } return total_input_bytes; } void BenchmarkTfLiteModel::PrepareInputData() { - auto interpreter_inputs = interpreter->inputs(); + auto interpreter_inputs = interpreter_->inputs(); const size_t input_size = interpreter_inputs.size(); CleanUp(); for (int j = 0; j < input_size; ++j) { int i = interpreter_inputs[j]; - TfLiteTensor* t = interpreter->tensor(i); + TfLiteTensor* t = interpreter_->tensor(i); std::vector sizes = TfLiteIntArrayToVector(t->dims); int num_elements = 1; for (int i = 0; i < sizes.size(); ++i) { @@ -388,25 +388,25 @@ void BenchmarkTfLiteModel::PrepareInputData() { } void BenchmarkTfLiteModel::ResetInputsAndOutputs() { - auto interpreter_inputs = interpreter->inputs(); + 
auto interpreter_inputs = interpreter_->inputs(); // Set the values of the input tensors from inputs_data_. for (int j = 0; j < interpreter_inputs.size(); ++j) { int i = interpreter_inputs[j]; - TfLiteTensor* t = interpreter->tensor(i); + TfLiteTensor* t = interpreter_->tensor(i); if (t->type == kTfLiteFloat32) { - std::memcpy(interpreter->typed_tensor(i), inputs_data_[j].data.f, + std::memcpy(interpreter_->typed_tensor(i), inputs_data_[j].data.f, inputs_data_[j].bytes); } else if (t->type == kTfLiteInt32) { - std::memcpy(interpreter->typed_tensor(i), + std::memcpy(interpreter_->typed_tensor(i), inputs_data_[j].data.i32, inputs_data_[j].bytes); } else if (t->type == kTfLiteInt16) { - std::memcpy(interpreter->typed_tensor(i), + std::memcpy(interpreter_->typed_tensor(i), inputs_data_[j].data.i16, inputs_data_[j].bytes); } else if (t->type == kTfLiteUInt8) { - std::memcpy(interpreter->typed_tensor(i), + std::memcpy(interpreter_->typed_tensor(i), inputs_data_[j].data.uint8, inputs_data_[j].bytes); } else if (t->type == kTfLiteInt8) { - std::memcpy(interpreter->typed_tensor(i), + std::memcpy(interpreter_->typed_tensor(i), inputs_data_[j].data.int8, inputs_data_[j].bytes); } else if (t->type == kTfLiteString) { tflite::DynamicBuffer buffer; @@ -414,7 +414,7 @@ void BenchmarkTfLiteModel::ResetInputsAndOutputs() { FillRandomString(&buffer, sizes, []() { return "we're have some friends over saturday to hang out in the yard"; }); - buffer.WriteToTensor(interpreter->tensor(i), /*new_shape=*/nullptr); + buffer.WriteToTensor(interpreter_->tensor(i), /*new_shape=*/nullptr); } else { TFLITE_LOG(FATAL) << "Don't know how to populate tensor " << t->name << " of type " << t->type; @@ -424,27 +424,27 @@ void BenchmarkTfLiteModel::ResetInputsAndOutputs() { void BenchmarkTfLiteModel::Init() { std::string graph = params_.Get("graph"); - model = tflite::FlatBufferModel::BuildFromFile(graph.c_str()); - if (!model) { + model_ = tflite::FlatBufferModel::BuildFromFile(graph.c_str()); + if (!model_) { TFLITE_LOG(FATAL) << "Failed to mmap model " << graph; } TFLITE_LOG(INFO) << "Loaded model " << graph; - model->error_reporter(); + model_->error_reporter(); TFLITE_LOG(INFO) << "resolved reporter"; auto resolver = GetOpResolver(); const int32_t num_threads = params_.Get("num_threads"); - tflite::InterpreterBuilder(*model, *resolver)(&interpreter, num_threads); - if (!interpreter) { + tflite::InterpreterBuilder(*model_, *resolver)(&interpreter_, num_threads); + if (!interpreter_) { TFLITE_LOG(FATAL) << "Failed to construct interpreter"; } - interpreter->UseNNAPI(params_.Get("use_legacy_nnapi")); + interpreter_->UseNNAPI(params_.Get("use_legacy_nnapi")); delegates_ = GetDelegates(); for (const auto& delegate : delegates_) { - if (interpreter->ModifyGraphWithDelegate(delegate.second.get()) != + if (interpreter_->ModifyGraphWithDelegate(delegate.second.get()) != kTfLiteOk) { TFLITE_LOG(FATAL) << "Failed to apply " << delegate.first << " delegate."; } else { @@ -452,23 +452,23 @@ void BenchmarkTfLiteModel::Init() { } } - interpreter->SetAllowFp16PrecisionForFp32(params_.Get("allow_fp16")); + interpreter_->SetAllowFp16PrecisionForFp32(params_.Get("allow_fp16")); - auto interpreter_inputs = interpreter->inputs(); + auto interpreter_inputs = interpreter_->inputs(); - if (!inputs.empty()) { - TFLITE_BENCHMARK_CHECK_EQ(inputs.size(), interpreter_inputs.size()) + if (!inputs_.empty()) { + TFLITE_BENCHMARK_CHECK_EQ(inputs_.size(), interpreter_inputs.size()) << "Inputs mismatch: Model inputs #:" << interpreter_inputs.size() - << " 
expected: " << inputs.size(); + << " expected: " << inputs_.size(); } // Check if the tensor names match, and log a warning if it doesn't. // TODO(ycling): Consider to make this an error again when the new converter // create tensors with consistent naming. - for (int j = 0; j < inputs.size(); ++j) { - const InputLayerInfo& input = inputs[j]; + for (int j = 0; j < inputs_.size(); ++j) { + const InputLayerInfo& input = inputs_[j]; int i = interpreter_inputs[j]; - TfLiteTensor* t = interpreter->tensor(i); + TfLiteTensor* t = interpreter_->tensor(i); if (input.name != t->name) { TFLITE_LOG(WARN) << "Tensor # " << i << " is named " << t->name << " but flags call it " << input.name; @@ -476,23 +476,23 @@ void BenchmarkTfLiteModel::Init() { } // Resize all non-string tensors. - for (int j = 0; j < inputs.size(); ++j) { - const InputLayerInfo& input = inputs[j]; + for (int j = 0; j < inputs_.size(); ++j) { + const InputLayerInfo& input = inputs_[j]; int i = interpreter_inputs[j]; - TfLiteTensor* t = interpreter->tensor(i); + TfLiteTensor* t = interpreter_->tensor(i); if (t->type != kTfLiteString) { - interpreter->ResizeInputTensor(i, input.shape); + interpreter_->ResizeInputTensor(i, input.shape); } } - if (interpreter->AllocateTensors() != kTfLiteOk) { + if (interpreter_->AllocateTensors() != kTfLiteOk) { TFLITE_LOG(FATAL) << "Failed to allocate tensors!"; } // Install profilers if necessary. if (params_.Get("enable_op_profiling")) { profiling_listener_.reset(new ProfilingListener( - interpreter.get(), + interpreter_.get(), params_.Get("max_profiling_buffer_entries"))); AddListener(profiling_listener_.get()); } @@ -507,7 +507,7 @@ BenchmarkTfLiteModel::TfLiteDelegatePtrMap BenchmarkTfLiteModel::GetDelegates() TfLiteDelegatePtrMap delegates; if (params_.Get("use_gpu")) { Interpreter::TfLiteDelegatePtr delegate = - evaluation::CreateGPUDelegate(model.get()); + evaluation::CreateGPUDelegate(model_.get()); if (!delegate) { TFLITE_LOG(WARN) << "GPU acceleration is unsupported on this platform."; } else { @@ -551,7 +551,7 @@ std::unique_ptr BenchmarkTfLiteModel::GetOpResolver() } void BenchmarkTfLiteModel::RunImpl() { - if (interpreter->Invoke() != kTfLiteOk) { + if (interpreter_->Invoke() != kTfLiteOk) { TFLITE_LOG(FATAL) << "Failed to invoke!"; } } diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h index 04d190531b8..79b59474235 100644 --- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h +++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h @@ -62,15 +62,15 @@ class BenchmarkTfLiteModel : public BenchmarkModel { void CleanUp(); - std::unique_ptr model; - std::unique_ptr interpreter; + std::unique_ptr model_; + std::unique_ptr interpreter_; private: struct InputTensorData { TfLitePtrUnion data; size_t bytes; }; - std::vector inputs; + std::vector inputs_; std::vector inputs_data_; std::unique_ptr profiling_listener_; std::unique_ptr gemmlowp_profiling_listener_; From 2e8fdd03beb46ea9aafdb7fe4d7f114707384890 Mon Sep 17 00:00:00 2001 From: Amit Srivastava Date: Tue, 9 Jul 2019 09:27:55 +0530 Subject: [PATCH 0277/3053] Removed Depricated API from the file. 
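The updated line below uses the single-argument form of `where`, which returns the coordinates of the `True` entries; `reduce_min` then selects the lowest matching bucket. A toy illustration with made-up bucket boundaries (not from this file):

```python
import tensorflow as tf

length = 7
buckets_min = tf.constant([0, 5, 10])
buckets_max = tf.constant([5, 10, 20])
in_bucket = (buckets_min <= length) & (length < buckets_max)  # [False, True, False]
# One-argument where returns the coordinates of True entries (shape [n, 1]);
# reduce_min picks the lowest matching bucket index.
which_bucket = tf.reduce_min(tf.where(in_bucket))             # 1
```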
--- tensorflow/contrib/training/python/training/bucket_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/training/python/training/bucket_ops.py b/tensorflow/contrib/training/python/training/bucket_ops.py index 10f3f88f3eb..b0398e3d3f3 100644 --- a/tensorflow/contrib/training/python/training/bucket_ops.py +++ b/tensorflow/contrib/training/python/training/bucket_ops.py @@ -399,7 +399,7 @@ def bucket_by_sequence_length(input_length, conditions_c = math_ops.logical_and( math_ops.less_equal(buckets_min, input_length), math_ops.less(input_length, buckets_max)) - which_bucket = math_ops.reduce_min(array_ops.where(conditions_c)) + which_bucket = math_ops.reduce_min(array_ops.where_v2(conditions_c)) which_bucket = math_ops.cast(which_bucket, dtypes.int32) if shapes is not None: From eb7b74d301ba3ffb97d5bbe8e350714497544add Mon Sep 17 00:00:00 2001 From: Amit Srivastava Date: Tue, 9 Jul 2019 09:29:12 +0530 Subject: [PATCH 0278/3053] Removed Depricated API from the file. --- tensorflow/contrib/training/python/training/sampling_ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/training/python/training/sampling_ops.py b/tensorflow/contrib/training/python/training/sampling_ops.py index 849b77d6095..257cc4fce21 100644 --- a/tensorflow/contrib/training/python/training/sampling_ops.py +++ b/tensorflow/contrib/training/python/training/sampling_ops.py @@ -417,7 +417,7 @@ def _calculate_acceptance_probabilities(init_probs, target_probs): ratio_l = target_probs / init_probs # Replace NaNs with 0s. - ratio_l = array_ops.where( + ratio_l = array_ops.where_v2( math_ops.is_nan(ratio_l), array_ops.zeros_like(ratio_l), ratio_l) # Calculate list of acceptance probabilities. From 616b8b596346e10b74a370bfdb534bb3bb24c4df Mon Sep 17 00:00:00 2001 From: Amit Srivastava Date: Tue, 9 Jul 2019 09:31:51 +0530 Subject: [PATCH 0279/3053] Removed Depricated API from the file. --- .../training/python/training/sequence_queueing_state_saver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py index e44c4f8c0ef..02baf4e071e 100644 --- a/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py +++ b/tensorflow/contrib/training/python/training/sequence_queueing_state_saver.py @@ -594,7 +594,7 @@ class NextQueuedSequenceBatch(object): # unless we explicitly tie them to CPU. with ops.colocate_with(self._state_saver._capacity_queue.queue_ref): indices_where_not_done = array_ops.reshape( - array_ops.where( + array_ops.where_v2( math_ops.logical_not(self._state_saver._sequence_is_done)), [-1]) keeping_next_key = array_ops.gather( From c7885688c2ddbe81f77ddf56613d383728af5282 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 22 Jul 2019 02:02:19 -0700 Subject: [PATCH 0280/3053] compat: Update forward compatibility horizon to 2019-07-22 PiperOrigin-RevId: 259282362 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 128253b357e..bb236f1142e 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -31,7 +31,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. 
It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2019, 7, 21) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2019, 7, 22) _FORWARD_COMPATIBILITY_HORIZON_OVERRIDDEN = False _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" From 2d254ed531b0b18fb9a998f2484691da2925f6d6 Mon Sep 17 00:00:00 2001 From: Koan-Sin Tan Date: Mon, 22 Jul 2019 17:12:57 +0800 Subject: [PATCH 0281/3053] [tflite] fix a typo in tools evaluation doc a trivial error becasue of copy & paste? --- .../evaluation/tasks/imagenet_image_classification/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/README.md b/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/README.md index 07b9b187b16..382719f012d 100644 --- a/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/README.md +++ b/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/README.md @@ -191,7 +191,7 @@ adb push ${MODEL_LABELS_TXT} /data/local/tmp/model_output_labels.txt (8) Run the binary. ``` -adb shell /data/local/tmp/imagenet_accuracy_eval \ +adb shell /data/local/tmp/run_eval \ --model_file=/data/local/tmp/mobilenet_quant_v1_224.tflite \ --ground_truth_images_path=/data/local/tmp/ilsvrc_images \ --ground_truth_labels=/data/local/tmp/ilsvrc_validation_labels.txt \ From 1ee51a3b868a3ccd5f80724f6b9389fd0a9aed07 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 22 Jul 2019 02:02:19 -0700 Subject: [PATCH 0282/3053] Update GraphDef version to 104. PiperOrigin-RevId: 259282364 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index dcf8c974a63..a01653124b2 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 103 // Updated: 2019/7/21 +#define TF_GRAPH_DEF_VERSION 104 // Updated: 2019/7/22 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). 
// From c1005c4a73e4c2328662bfc203d4528bf4164fce Mon Sep 17 00:00:00 2001 From: Amit Srivastava Date: Mon, 22 Jul 2019 15:02:22 +0530 Subject: [PATCH 0283/3053] Removed the deprecated API from contrib module --- .../boosted_trees/python/kernel_tests/training_ops_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py index 86fd5770a03..74a51f4e4d8 100644 --- a/tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py +++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/training_ops_test.py @@ -142,7 +142,8 @@ def _gen_categorical_split_info(fc, feat_id, left_weight, right_weight): def _get_bias_update(grads, hess): - return array_ops.where(hess > 0, -grads / hess, array_ops.zeros_like(grads)) + return array_ops.where_v2(hess > 0, -grads / hess, + array_ops.zeros_like(grads)) class CenterTreeEnsembleBiasOpTest(test_util.TensorFlowTestCase): From 2f3a71a6bebdc8bbb6962202c5392569ee2a187b Mon Sep 17 00:00:00 2001 From: Amit Srivastava Date: Mon, 22 Jul 2019 15:02:36 +0530 Subject: [PATCH 0284/3053] Removed the deprecated API from contrib module --- .../contrib/distributions/python/ops/vector_diffeomixture.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py index f9748466c2e..b39dba7db6a 100644 --- a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py +++ b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py @@ -1060,5 +1060,5 @@ def softmax(x, axis, name=None): if axis_ is not None: axis = np.int(ndims + axis_ if axis_ < 0 else axis_) else: - axis = array_ops.where(axis < 0, ndims + axis, axis) + axis = array_ops.where_v2(axis < 0, ndims + axis, axis) return nn_ops.softmax(x, axis=axis) From 808a6593d1f767fe095b0f8c59597bce5103557a Mon Sep 17 00:00:00 2001 From: Amit Srivastava Date: Mon, 22 Jul 2019 15:02:47 +0530 Subject: [PATCH 0285/3053] Removed the deprecated API from contrib module --- .../contrib/gan/python/eval/python/classifier_metrics_impl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py index 2c301267900..7c88a7b611a 100644 --- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py +++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py @@ -108,7 +108,7 @@ def _symmetric_matrix_square_root(mat, eps=1e-10): # Unlike numpy, tensorflow's return order is (s, u, v) s, u, v = linalg_ops.svd(mat) # sqrt is unstable around 0, just use 0 in such case - si = array_ops.where(math_ops.less(s, eps), s, math_ops.sqrt(s)) + si = array_ops.where_v2(math_ops.less(s, eps), s, math_ops.sqrt(s)) # Note that the v returned by Tensorflow is v = V # (when referencing the equation A = U S V^T) # This is unlike Numpy which returns v = V^T From 0e8f91e8d2faf806babcf8ab4db7f6d1c7040698 Mon Sep 17 00:00:00 2001 From: Amit Srivastava Date: Mon, 22 Jul 2019 15:03:11 +0530 Subject: [PATCH 0286/3053] Removed the deprecated API from contrib module --- .../seq2seq/python/kernel_tests/beam_search_decoder_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py 
b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py index 6360d1cfdc1..343e5f4be69 100644 --- a/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py +++ b/tensorflow/contrib/seq2seq/python/kernel_tests/beam_search_decoder_test.py @@ -407,8 +407,8 @@ class TestLargeBeamStep(test.TestCase): log_prob_neg_inf = array_ops.ones( [self.batch_size, self.beam_width], dtype=dtypes.float32) * -np.Inf - log_probs = array_ops.where(log_prob_mask, log_prob_zeros, - log_prob_neg_inf) + log_probs = array_ops.where_v2(log_prob_mask, log_prob_zeros, + log_prob_neg_inf) return log_probs log_probs = get_probs() From e932cb35ed586ad202afe1b299391f096db1721b Mon Sep 17 00:00:00 2001 From: Amit Srivastava Date: Mon, 22 Jul 2019 15:03:38 +0530 Subject: [PATCH 0287/3053] Removed the deprecated API from contrib module --- tensorflow/contrib/slim/python/slim/learning_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/contrib/slim/python/slim/learning_test.py b/tensorflow/contrib/slim/python/slim/learning_test.py index 5db4fe02b8e..aefc07696b9 100644 --- a/tensorflow/contrib/slim/python/slim/learning_test.py +++ b/tensorflow/contrib/slim/python/slim/learning_test.py @@ -197,7 +197,8 @@ class MultiplyGradientsTest(test.TestCase): gradient = constant_op.constant(self._grad_vec, dtype=dtypes.float32) variable = variables_lib.Variable(array_ops.zeros_like(gradient)) multiplier_flag = variables_lib.Variable(True) - tensor_multiplier = array_ops.where(multiplier_flag, self._multiplier, 1.0) + tensor_multiplier = array_ops.where_v2(multiplier_flag, self._multiplier, + 1.0) grad_to_var = (gradient, variable) gradient_multipliers = {variable: tensor_multiplier} From 9074c4c150b17b19f187efd5a1a3b8bd5f6ed975 Mon Sep 17 00:00:00 2001 From: Amit Srivastava Date: Mon, 22 Jul 2019 15:03:51 +0530 Subject: [PATCH 0288/3053] Removed the deprecated API from contrib module --- tensorflow/contrib/tensor_forest/python/tensor_forest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/contrib/tensor_forest/python/tensor_forest.py b/tensorflow/contrib/tensor_forest/python/tensor_forest.py index df10997d633..ddeff8dc9af 100644 --- a/tensorflow/contrib/tensor_forest/python/tensor_forest.py +++ b/tensorflow/contrib/tensor_forest/python/tensor_forest.py @@ -461,7 +461,8 @@ class RandomForestGraphs(object): mask = math_ops.less( r, array_ops.ones_like(r) * self.params.bagging_fraction) - gather_indices = array_ops.squeeze(array_ops.where(mask), axis=[1]) + gather_indices = array_ops.squeeze(array_ops.where_v2(mask), + axis=[1]) # TODO(thomaswc): Calculate out-of-bag data and labels, and store # them for use in calculating statistics later. tree_data = array_ops.gather(processed_dense_features, gather_indices) From 384e7f8c86509e9c0a1319ebf2f895a8abb27f76 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 22 Jul 2019 06:03:34 -0700 Subject: [PATCH 0289/3053] [XLA:Python] Refactor Python specifics out of PyLocalClient and PyLocalBuffer to remove dependency on pybind11. 
PiperOrigin-RevId: 259312456 --- tensorflow/compiler/xla/python/BUILD | 1 - .../compiler/xla/python/local_client.cc | 113 ++++++------------ tensorflow/compiler/xla/python/local_client.h | 23 ++-- tensorflow/compiler/xla/python/xla.cc | 73 +++++++++-- 4 files changed, 117 insertions(+), 93 deletions(-) diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD index 0e66e99faeb..fbcaa6f9fc3 100644 --- a/tensorflow/compiler/xla/python/BUILD +++ b/tensorflow/compiler/xla/python/BUILD @@ -69,7 +69,6 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", - "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/types:optional", "@pybind11", diff --git a/tensorflow/compiler/xla/python/local_client.cc b/tensorflow/compiler/xla/python/local_client.cc index 982bf9eb21f..b6d44ef011e 100644 --- a/tensorflow/compiler/xla/python/local_client.cc +++ b/tensorflow/compiler/xla/python/local_client.cc @@ -85,7 +85,6 @@ limitations under the License. #include "absl/strings/str_format.h" #include "absl/synchronization/mutex.h" #include "absl/time/time.h" -#include "include/pybind11/pybind11.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/executable_run_options.h" @@ -106,8 +105,6 @@ limitations under the License. namespace xla { -namespace py = pybind11; - static StatusOr> CreateBFCAllocator( se::Platform* platform, LocalClient* client, double memory_fraction, bool preallocate) { @@ -222,47 +219,21 @@ PyLocalClient::PyLocalClient( Status PyLocalClient::TransferToInfeed(const LiteralSlice& literal, int device_ordinal) { - py_ref_manager().CollectGarbage(); - py::gil_scoped_release gil_release; return client_->TransferToInfeedLocal(literal, device_ordinal); } -StatusOr PyLocalClient::TransferFromOutfeed( - const Shape& shape, int device_ordinal) { - py_ref_manager().CollectGarbage(); - Literal literal; - { - py::gil_scoped_release gil_release; - TF_ASSIGN_OR_RETURN( - literal, client_->TransferFromOutfeedLocal(shape, device_ordinal)); - } - return LiteralToPython(std::make_shared(std::move(literal))); +StatusOr PyLocalClient::TransferFromOutfeed(const Shape& shape, + int device_ordinal) { + return client_->TransferFromOutfeedLocal(shape, device_ordinal); } /* static */ -StatusOr> PyLocalBuffer::FromPython( - const py::object& argument, std::shared_ptr client, - int device_ordinal) { - tensorflow::profiler::TraceMe traceme("PyLocalBuffer::FromPython"); - struct H2DTransfer { - PythonBufferTree tree; - std::shared_ptr py_buffer_ref; - }; - auto transfer = std::make_shared(); - TF_ASSIGN_OR_RETURN(transfer->tree, GetPythonBufferTree(argument)); - - client->py_ref_manager().CollectGarbage(); - - // Take a reference to the buffer to ensure that the inputs in host memory - // remain live until the transfer is complete. - transfer->py_buffer_ref = client->py_ref_manager().ManageReferences( - absl::MakeSpan(transfer->tree.arrays)); - transfer->tree.arrays.clear(); - - // We are done manipulating Python objects; release the GIL. 
- py::gil_scoped_release gil_release; - VLOG(1) << "PyLocalBuffer::FromPython: shape: " - << transfer->tree.shape.ToString() +StatusOr> PyLocalBuffer::FromLiterals( + std::vector leaves_literals, const Shape& tuple_shape, + std::shared_ptr leaves_reference, + std::shared_ptr client, int device_ordinal) { + tensorflow::profiler::TraceMe traceme("PyLocalBuffer::FromLiterals"); + VLOG(1) << "PyLocalBuffer::FromLiterals: shape: " << tuple_shape.ToString() << " device ordinal: " << device_ordinal; Device* device = &client->device(device_ordinal); @@ -270,11 +241,11 @@ StatusOr> PyLocalBuffer::FromPython( client->client()->backend().transfer_manager(); se::DeviceMemoryAllocator* allocator = client->allocator(); TF_ASSIGN_OR_RETURN( - transfer->tree.shape, - transfer_manager->ChooseCompactLayoutForShape(transfer->tree.shape)); + Shape compact_shape, + transfer_manager->ChooseCompactLayoutForShape(tuple_shape)); TF_ASSIGN_OR_RETURN(ScopedShapedBuffer scoped_buffer, transfer_manager->AllocateScopedShapedBuffer( - transfer->tree.shape, allocator, device_ordinal)); + compact_shape, allocator, device_ordinal)); // Make the host to device stream wait for the newly allocated buffer to be // available on the compute stream. We schedule this wait synchronously; while @@ -293,21 +264,25 @@ StatusOr> PyLocalBuffer::FromPython( SharedDeviceBuffer::FromScopedShapedBuffer(std::move(scoped_buffer), definition_event); + // TODO(makro): Use move capture once C++ 14 features are available. + auto leaves = std::make_shared>( + std::move(leaves_literals)); auto transfer_h2d = [client, transfer_manager, device, device_ordinal, - device_buffer, transfer]() { + device_buffer, compact_shape, leaves, + leaves_reference]() { // This function uses TF_CHECK_OK and ValueOrDie() since we have no way to // report failures from a callback. However, the operations here are // unlikely to fail and not recoverable even if we were to fail: DMAs to // memory that has already been allocated, and a possible Event allocation. 
- ShapedBuffer buffer = device_buffer->AsShapedBuffer(transfer->tree.shape); + ShapedBuffer buffer = device_buffer->AsShapedBuffer(compact_shape); TF_CHECK_OK(transfer_manager->WriteTupleIndexTablesAsync( device->host_to_device_stream(), buffer)); std::vector> staging_buffers; - staging_buffers.reserve(transfer->tree.leaves.size()); - auto it = transfer->tree.leaves.begin(); + staging_buffers.reserve(leaves->size()); + auto it = leaves->begin(); for (const ShapeUtil::IndexedShape& indexed_shape : - ShapeUtil::GetLeafShapes(transfer->tree.shape)) { - CHECK(it != transfer->tree.leaves.end()); + ShapeUtil::GetLeafShapes(compact_shape)) { + CHECK(it != leaves->end()); ShapedBuffer leaf( indexed_shape.shape, transfer_manager->HostShapeToDeviceShape(indexed_shape.shape), @@ -352,19 +327,19 @@ StatusOr> PyLocalBuffer::FromPython( device->ThenRelease(device->host_to_device_stream(), device_buffer); } - device->ThenRelease(device->host_to_device_stream(), - std::make_pair(std::move(transfer->py_buffer_ref), - std::move(staging_buffers))); + device->ThenRelease( + device->host_to_device_stream(), + std::make_pair(leaves_reference, std::move(staging_buffers))); }; client->h2d_transfer_pool()->Schedule(transfer_h2d); return absl::make_unique( - transfer->tree.shape, std::move(device_buffer), std::move(client)); + compact_shape, std::move(device_buffer), std::move(client)); } /* static */ StatusOr> PyLocalBuffer::MakeTuple( const std::vector buffers, std::shared_ptr client, int device_ordinal) { - std::vector host_shapes; + std::vector host_shapes; std::vector> device_buffers; host_shapes.reserve(buffers.size()); device_buffers.reserve(buffers.size()); @@ -458,29 +433,22 @@ Status PyLocalBuffer::CopyToHostAsync() { return Status::OK(); } -StatusOr PyLocalBuffer::ToPython() { - tensorflow::profiler::TraceMe traceme("PyLocalBuffer::ToPython"); +StatusOr> PyLocalBuffer::ToLiteral() { + tensorflow::profiler::TraceMe traceme("PyLocalBuffer::ToLiteral"); std::shared_ptr device_buffer = DeviceBuffer(); if (!device_buffer) { - return InvalidArgument("ToPython() called on invalid buffer."); + return InvalidArgument("ToLiteral() called on invalid buffer."); } - client_->py_ref_manager().CollectGarbage(); - std::shared_ptr literal; + TF_RETURN_IF_ERROR(CopyToHostAsync()); + std::shared_ptr host_value; { - py::gil_scoped_release gil_release; - TF_RETURN_IF_ERROR(CopyToHostAsync()); - std::shared_ptr host_value; - { - absl::MutexLock lock(&mu_); - host_value = host_value_; - } - host_value->ready.WaitForNotification(); - TF_RETURN_IF_ERROR(host_value->status); - literal = host_value->value; + absl::MutexLock lock(&mu_); + host_value = host_value_; } - - return LiteralToPython(std::move(literal)); + host_value->ready.WaitForNotification(); + TF_RETURN_IF_ERROR(host_value->status); + return host_value->value; } std::shared_ptr PyLocalBuffer::DeviceBuffer() const { @@ -524,8 +492,6 @@ PyLocalBuffer::DestructureTuple() { StatusOr> PyLocalBuffer::CopyToDevice( int dst_device_ordinal) { tensorflow::profiler::TraceMe traceme("PyLocalBuffer::CopyToDevice"); - client_->py_ref_manager().CollectGarbage(); - py::gil_scoped_release gil_release; std::shared_ptr src_device_buffer = DeviceBuffer(); if (dst_device_ordinal == device_ordinal_) { return absl::make_unique(on_host_shape_, src_device_buffer, @@ -554,7 +520,7 @@ StatusOr> PyLocalBuffer::CopyToDevice( // Copy the leaf buffers. 
for (const auto& leaf : src_buffer.buffers().leaves()) { - const xla::ShapeIndex& index = leaf.first; + const ShapeIndex& index = leaf.first; const se::DeviceMemoryBase& input_buffer = leaf.second; const se::DeviceMemoryBase& output_buffer = dst_buffer.buffer(index); TF_RET_CHECK(input_buffer.size() == output_buffer.size()) @@ -603,9 +569,6 @@ Status PyLocalBuffer::BlockHostUntilReady() { return InvalidArgument("BlockHostUntilReady() called on invalid buffer."); } - client_->py_ref_manager().CollectGarbage(); - py::gil_scoped_release gil_release; - // This code waits at least until the buffer is ready, but it may wait longer // if there are other device to host transfers scheduled. If this proves to // be an issue, we could either use a separate stream for this purpose, or diff --git a/tensorflow/compiler/xla/python/local_client.h b/tensorflow/compiler/xla/python/local_client.h index 8ad4c44d53f..65e3203a258 100644 --- a/tensorflow/compiler/xla/python/local_client.h +++ b/tensorflow/compiler/xla/python/local_client.h @@ -23,7 +23,6 @@ limitations under the License. #include "absl/synchronization/mutex.h" #include "absl/synchronization/notification.h" #include "absl/types/span.h" -#include "include/pybind11/pybind11.h" #include "tensorflow/compiler/xla/client/executable_build_options.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_computation.h" @@ -78,8 +77,7 @@ class PyLocalClient { virtual ~PyLocalClient() = default; Status TransferToInfeed(const LiteralSlice& literal, int device_ordinal); - StatusOr TransferFromOutfeed(const Shape& shape, - int device_ordinal); + StatusOr TransferFromOutfeed(const Shape& shape, int device_ordinal); int device_count() const { return client_->device_count(); } Device& device(int device_ordinal) const { @@ -128,9 +126,10 @@ class PyLocalClient { // Thread-safe. class PyLocalBuffer { public: - static StatusOr> FromPython( - const pybind11::object& argument, std::shared_ptr client, - int device_ordinal); + static StatusOr> FromLiterals( + std::vector leaves_literals, const Shape& tuple_shape, + std::shared_ptr leaves_reference, + std::shared_ptr client, int device_ordinal); static StatusOr> MakeTuple( const std::vector buffers, @@ -149,15 +148,19 @@ class PyLocalBuffer { const Shape& on_host_shape() const { return on_host_shape_; } int device_ordinal() const { return device_ordinal_; } + // TODO(makro): Make `client` private once `PythonRefManager` is refactored + // out of `PyLocalClient`. + PyLocalClient* client() const { return client_.get(); } + // Returns the buffer's value as a tuple DAG of Python arrays. If the value // has previously been prefetched to the host, then returns the prefetched // version, otherwise copies the buffer to the host. Blocks until the // value is ready. - StatusOr ToPython(); + StatusOr> ToLiteral(); // Initiates a copy of the buffer to the host. Does not block waiting for // the transfer to complete. The value can be retrieved by a later call to - // ToPython(). + // ToLiteral(). Status CopyToHostAsync(); // Returns the associated device buffer. Returns a nullptr if the buffer is @@ -190,14 +193,14 @@ class PyLocalBuffer { std::shared_ptr device_buffer_ GUARDED_BY(mu_); // The cached value of the buffer on the host, produced either from a call to - // CopyToHost or from a call to ToPython. Once a value has been fetched to + // CopyToHost or from a call to ToLiteral. 
Once a value has been fetched to // the host, it persists Delete() is called or the PyLocalBuffer is destroyed. struct HostValue { absl::Notification ready; // status and value are valid for reading only after `ready` has been // notified. Status status; - std::shared_ptr value; + std::shared_ptr value; }; std::shared_ptr host_value_ GUARDED_BY(mu_); }; diff --git a/tensorflow/compiler/xla/python/xla.cc b/tensorflow/compiler/xla/python/xla.cc index 172e24f801e..6cd56b800a2 100644 --- a/tensorflow/compiler/xla/python/xla.cc +++ b/tensorflow/compiler/xla/python/xla.cc @@ -312,18 +312,77 @@ PYBIND11_MODULE(xla_extension, m) { py::arg("xla_platform_id"), py::arg("asynchronous"), py::arg("allocator_config") = AllocatorConfig()) .def("DeviceCount", &PyLocalClient::device_count) - .def("TransferToInfeed", &PyLocalClient::TransferToInfeed) - .def("TransferFromOutfeed", &PyLocalClient::TransferFromOutfeed); + .def("TransferToInfeed", + [](PyLocalClient* client, const LiteralSlice& literal, + int device_ordinal) { + client->py_ref_manager().CollectGarbage(); + py::gil_scoped_release gil_release; + return client->TransferToInfeed(literal, device_ordinal); + }) + .def("TransferFromOutfeed", + [](PyLocalClient* client, const Shape& shape, + int device_ordinal) -> StatusOr { + client->py_ref_manager().CollectGarbage(); + std::shared_ptr literal_shared; + { + py::gil_scoped_release gil_release; + TF_ASSIGN_OR_RETURN(Literal literal, client->TransferFromOutfeed( + shape, device_ordinal)); + literal_shared = std::make_shared(std::move(literal)); + } + return LiteralToPython(std::move(literal_shared)); + }); py::class_(m, "PyLocalBuffer") - .def_static("from_python", &PyLocalBuffer::FromPython) + .def_static( + "from_python", + [](const pybind11::object& argument, + std::shared_ptr client, + int device_ordinal) -> StatusOr> { + client->py_ref_manager().CollectGarbage(); + TF_ASSIGN_OR_RETURN(PythonBufferTree tree, + GetPythonBufferTree(argument)); + std::shared_ptr py_buffer_ref = + client->py_ref_manager().ManageReferences( + absl::MakeSpan(tree.arrays)); + tree.arrays.clear(); + + std::vector leaves; + leaves.insert(leaves.end(), + std::make_move_iterator(tree.leaves.begin()), + std::make_move_iterator(tree.leaves.end())); + + py::gil_scoped_release gil_release; + return PyLocalBuffer::FromLiterals( + std::move(leaves), tree.shape, std::move(py_buffer_ref), + std::move(client), device_ordinal); + }) .def_static("make_tuple", &PyLocalBuffer::MakeTuple) - .def("copy_to_device", &PyLocalBuffer::CopyToDevice) + .def("copy_to_device", + [](PyLocalBuffer* buffer, int dst_device_ordinal) { + buffer->client()->py_ref_manager().CollectGarbage(); + py::gil_scoped_release gil_release; + return buffer->CopyToDevice(dst_device_ordinal); + }) .def("delete", &PyLocalBuffer::Delete) .def("destructure", &PyLocalBuffer::DestructureTuple) - .def("block_host_until_ready", &PyLocalBuffer::BlockHostUntilReady) + .def("block_host_until_ready", + [](PyLocalBuffer* buffer) { + buffer->client()->py_ref_manager().CollectGarbage(); + py::gil_scoped_release gil_release; + return buffer->BlockHostUntilReady(); + }) .def("copy_to_host_async", &PyLocalBuffer::CopyToHostAsync) - .def("to_py", &PyLocalBuffer::ToPython) + .def("to_py", + [](PyLocalBuffer* buffer) -> StatusOr { + buffer->client()->py_ref_manager().CollectGarbage(); + std::shared_ptr literal; + { + py::gil_scoped_release gil_release; + TF_ASSIGN_OR_RETURN(literal, buffer->ToLiteral()); + } + return LiteralToPython(std::move(literal)); + }) .def("shape", 
&PyLocalBuffer::on_host_shape) .def("device", &PyLocalBuffer::device_ordinal) .def("is_deleted", @@ -640,6 +699,6 @@ PYBIND11_MODULE(xla_extension, m) { py::class_(m, "ChannelHandle"); tensorflow::AddXrtSubmodule(&m); -} +} // NOLINT(readability/fn_size) } // namespace xla From 68595d89ce5158d9ba232684082c2b87fa0446be Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 22 Jul 2019 06:32:13 -0700 Subject: [PATCH 0290/3053] Implement a cumulative log-sum-exp operation. PiperOrigin-RevId: 259315584 --- .../api_def_CumulativeLogsumexp.pbtxt | 50 ++++++++ tensorflow/core/kernels/scan_ops.cc | 40 +++++- tensorflow/core/kernels/scan_ops.h | 35 ++++++ tensorflow/core/kernels/scan_ops_gpu.h | 26 +++- .../core/kernels/scan_ops_gpu_double.cu.cc | 2 + .../core/kernels/scan_ops_gpu_float.cu.cc | 2 + .../core/kernels/scan_ops_gpu_half.cu.cc | 2 + tensorflow/core/ops/math_ops.cc | 10 ++ tensorflow/python/kernel_tests/BUILD | 16 +++ .../kernel_tests/cumulative_logsumexp_test.py | 114 ++++++++++++++++++ tensorflow/python/ops/math_grad.py | 34 ++++++ tensorflow/python/ops/math_ops.py | 55 +++++++++ .../tools/api/golden/v1/tensorflow.math.pbtxt | 4 + .../api/golden/v1/tensorflow.raw_ops.pbtxt | 4 + .../tools/api/golden/v2/tensorflow.math.pbtxt | 4 + .../api/golden/v2/tensorflow.raw_ops.pbtxt | 4 + 16 files changed, 396 insertions(+), 6 deletions(-) create mode 100644 tensorflow/core/api_def/base_api/api_def_CumulativeLogsumexp.pbtxt create mode 100644 tensorflow/python/kernel_tests/cumulative_logsumexp_test.py diff --git a/tensorflow/core/api_def/base_api/api_def_CumulativeLogsumexp.pbtxt b/tensorflow/core/api_def/base_api/api_def_CumulativeLogsumexp.pbtxt new file mode 100644 index 00000000000..7db367c71bd --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_CumulativeLogsumexp.pbtxt @@ -0,0 +1,50 @@ +op { + graph_op_name: "CumulativeLogsumexp" + visibility: HIDDEN + in_arg { + name: "x" + description: < [a, log(exp(a) + exp(b)), log(exp(a) + exp(b) + exp(c))] +``` + +By setting the `exclusive` kwarg to `True`, an exclusive cumulative log-sum-exp is +performed instead: +```python +tf.cumulative_logsumexp([a, b, c], exclusive=True) # => [-inf, a, log(exp(a) * exp(b))] +``` +Note that the neutral element of the log-sum-exp operation is `-inf`, +however, for performance reasons, the minimal value representable by the +floating point type is used instead. + +By setting the `reverse` kwarg to `True`, the cumulative log-sum-exp is performed in the +opposite direction. +END +} diff --git a/tensorflow/core/kernels/scan_ops.cc b/tensorflow/core/kernels/scan_ops.cc index 87e8aa4b761..20f6b864fd8 100644 --- a/tensorflow/core/kernels/scan_ops.cc +++ b/tensorflow/core/kernels/scan_ops.cc @@ -18,6 +18,9 @@ limitations under the License. #define EIGEN_USE_GPU #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#include "tensorflow/core/kernels/scan_ops.h" + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" @@ -25,10 +28,6 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/types.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" - -#include "tensorflow/core/kernels/scan_ops.h" - namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; @@ -107,8 +106,12 @@ namespace functor { TF_CALL_GPU_NUMBER_TYPES(DECLARE_FOR_ALL_REDUCERS); DECLARE_FOR_ALL_REDUCERS(int32); DECLARE_FOR_ALL_REDUCERS(int64); - #undef DECLARE_FOR_ALL_REDUCERS + +#define DECLARE_FOR_LOGSUMEXP_REDUCER(T) DECLARE(LogSumExpReducer, T); +TF_CALL_GPU_NUMBER_TYPES(DECLARE_FOR_LOGSUMEXP_REDUCER) +#undef DECLARE_FOR_LOGSUMEXP_REDUCER + #undef DECLARE } // namespace functor @@ -192,4 +195,31 @@ REGISTER_GPU_KERNELS(int64); #undef REGISTER_GPU_KERNELS #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#define REGISTER_CUMLOGSUMEXP_KERNEL(device, device_type, type, type_idx) \ + REGISTER_KERNEL_BUILDER( \ + Name("CumulativeLogsumexp") \ + .Device(device) \ + .TypeConstraint("T") \ + .TypeConstraint("Tidx") \ + .HostMemory("axis"), \ + ScanOp, type_idx>) + +#define REGISTER_CPU_KERNELS(type) \ + REGISTER_CUMLOGSUMEXP_KERNEL(DEVICE_CPU, CPUDevice, type, int32) \ + REGISTER_CUMLOGSUMEXP_KERNEL(DEVICE_CPU, CPUDevice, type, int64) + +TF_CALL_GPU_NUMBER_TYPES(REGISTER_CPU_KERNELS); +#undef REGISTER_CPU_KERNELS + +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#define REGISTER_GPU_KERNELS(type) \ + REGISTER_CUMLOGSUMEXP_KERNEL(DEVICE_GPU, GPUDevice, type, int32) \ + REGISTER_CUMLOGSUMEXP_KERNEL(DEVICE_GPU, GPUDevice, type, int64) + +TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS); +#undef REGISTER_GPU_KERNELS +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + +#undef REGISTER_CUMLOGSUMEXP_KERNEL + } // namespace tensorflow diff --git a/tensorflow/core/kernels/scan_ops.h b/tensorflow/core/kernels/scan_ops.h index 13831bb377d..1fd98f6656d 100644 --- a/tensorflow/core/kernels/scan_ops.h +++ b/tensorflow/core/kernels/scan_ops.h @@ -40,6 +40,41 @@ struct Scan { } }; +template +struct LogSumExp { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& a, + const T& b) const { + Eigen::internal::scalar_sum_op sum_op; + Eigen::internal::scalar_exp_op exp_op; + Eigen::internal::scalar_log_op log_op; + Eigen::internal::scalar_max_op max_op; + Eigen::internal::scalar_min_op min_op; + Eigen::internal::scalar_log1p_op log1p_op; + Eigen::internal::scalar_difference_op diff_op; + + auto mi = min_op(a, b); + auto ma = max_op(a, b); + + return sum_op(log1p_op(exp_op(diff_op(mi, ma))), ma); + } +}; + +template +struct LogSumExpReducer { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { + LogSumExp logsumexp; + *accum = logsumexp(*accum, t); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { + return Eigen::NumTraits::lowest(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { + return accum; + } +}; + } // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/scan_ops_gpu.h b/tensorflow/core/kernels/scan_ops_gpu.h index 1d3cb35517d..eaa9360a5b7 100644 --- a/tensorflow/core/kernels/scan_ops_gpu.h +++ b/tensorflow/core/kernels/scan_ops_gpu.h @@ -143,9 +143,16 @@ struct IsProd { std::is_same>::value); }; +template +struct IsLogSumExp { + constexpr static bool value = (std::is_same>::value || + std::is_same>::value); +}; + template struct IdentityValue { - static_assert(IsSum::value || IsProd::value, + static_assert(IsSum::value || IsProd::value || + IsLogSumExp::value, "IdentityValue not yet defined for this type."); template @@ -159,6 
+166,13 @@ struct IdentityValue { typename std::enable_if::value, U>::type t = U(1)) { return t; } + + template + __host__ __device__ U + operator()(typename std::enable_if::value, U>::type t = + U(Eigen::NumTraits::lowest())) { + return t; + } }; // Each block is mapped to one sequence. A contiguous range is mapped to the @@ -311,6 +325,16 @@ struct Scan, T> { } }; +template +struct Scan, T> { + void operator()(const GPUDevice& d, typename TTypes::ConstTensor in, + typename TTypes::Tensor out, + const LogSumExpReducer& reducer, const bool reverse, + const bool exclusive) { + LaunchScan>(d, in, out, LogSumExp(), reverse, exclusive); + } +}; + } // namespace functor } // end namespace tensorflow diff --git a/tensorflow/core/kernels/scan_ops_gpu_double.cu.cc b/tensorflow/core/kernels/scan_ops_gpu_double.cu.cc index f304c5cc53c..199a477b560 100644 --- a/tensorflow/core/kernels/scan_ops_gpu_double.cu.cc +++ b/tensorflow/core/kernels/scan_ops_gpu_double.cu.cc @@ -26,6 +26,8 @@ template struct functor::Scan, double>; template struct functor::Scan, double>; +template struct functor::Scan, + double>; } // namespace tensorflow #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/scan_ops_gpu_float.cu.cc b/tensorflow/core/kernels/scan_ops_gpu_float.cu.cc index 1d0780541cc..6704572c1cf 100644 --- a/tensorflow/core/kernels/scan_ops_gpu_float.cu.cc +++ b/tensorflow/core/kernels/scan_ops_gpu_float.cu.cc @@ -26,6 +26,8 @@ template struct functor::Scan, float>; template struct functor::Scan, float>; +template struct functor::Scan, + float>; } // namespace tensorflow #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/scan_ops_gpu_half.cu.cc b/tensorflow/core/kernels/scan_ops_gpu_half.cu.cc index 3ea7c5a47c7..0b16cb79ab8 100644 --- a/tensorflow/core/kernels/scan_ops_gpu_half.cu.cc +++ b/tensorflow/core/kernels/scan_ops_gpu_half.cu.cc @@ -26,6 +26,8 @@ template struct functor::Scan< GpuDevice, Eigen::internal::SumReducer, Eigen::half>; template struct functor::Scan< GpuDevice, Eigen::internal::ProdReducer, Eigen::half>; +template struct functor::Scan, + Eigen::half>; } // namespace tensorflow #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index d87d377b8c7..e68126209e4 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -1597,6 +1597,16 @@ REGISTER_OP("Cumprod") .Attr("Tidx: {int32, int64} = DT_INT32") .SetShapeFn(shape_inference::UnchangedShape); +REGISTER_OP("CumulativeLogsumexp") + .Input("x : T") + .Input("axis: Tidx") + .Attr("exclusive: bool = false") + .Attr("reverse: bool = false") + .Output("out: T") + .Attr("T: {float16, float32, float64}") + .Attr("Tidx: {int32, int64} = DT_INT32") + .SetShapeFn(shape_inference::UnchangedShape); + REGISTER_OP("QuantizedMatMul") .Input("a: T1") .Input("b: T2") diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index 0b176ecbbf7..3ce1ee7d151 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -316,6 +316,22 @@ cuda_py_test( xla_enable_strict_auto_jit = True, ) +cuda_py_test( + name = "cumulative_logsumexp_test", + size = "medium", + srcs = ["cumulative_logsumexp_test.py"], + additional_deps = [ + "//third_party/py/numpy", + "//tensorflow/python:client_testlib", + "//tensorflow/python:errors", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:math_ops", + "//tensorflow/python:map_fn", + 
"//tensorflow/python:array_ops", + ], + xla_enable_strict_auto_jit = True, +) + tf_py_test( name = "decode_csv_op_test", size = "small", diff --git a/tensorflow/python/kernel_tests/cumulative_logsumexp_test.py b/tensorflow/python/kernel_tests/cumulative_logsumexp_test.py new file mode 100644 index 00000000000..aae624f6605 --- /dev/null +++ b/tensorflow/python/kernel_tests/cumulative_logsumexp_test.py @@ -0,0 +1,114 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Functional tests for cumulative_logsumexp op.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gradient_checker_v2 +from tensorflow.python.ops import map_fn +from tensorflow.python.ops import math_ops +from tensorflow.python.platform import test + + +class CumulativeLogsumexpTest(test.TestCase): + valid_dtypes = [dtypes.float32, dtypes.float64] + + def _computeLogSumExp(self, x, **kwargs): + result_naive = math_ops.cumsum(math_ops.exp(x), **kwargs) + result_fused = math_ops.exp(math_ops.cumulative_logsumexp(x, **kwargs)) + return result_naive, result_fused + + def _testLogSumExp(self, x, dtype=dtypes.float32, use_gpu=False, **kwargs): + with self.cached_session(use_gpu=use_gpu): + x = ops.convert_to_tensor(x, dtype=dtype) + + result_naive, result_fused = self.evaluate( + self._computeLogSumExp(x, **kwargs)) + + self.assertAllClose(result_naive, result_fused) + + def _testLogSumExpAllArgs(self, x, axis=0, use_gpu=False): + for dtype in self.valid_dtypes: + for reverse in (True, False): + for exclusive in (True, False): + self._testLogSumExp( + x, dtype=dtype, use_gpu=use_gpu, + reverse=reverse, exclusive=exclusive, + axis=axis) + + def test1D(self): + x = np.arange(10) / 10.0 - 0.5 + self._testLogSumExpAllArgs(x, use_gpu=False) + self._testLogSumExpAllArgs(x, use_gpu=True) + + def test2D(self): + x = np.reshape(np.arange(20) / 20.0 - 0.5, (2, 10)) + + for axis in (-2, -1, 0, 1): + self._testLogSumExpAllArgs(x, axis=axis, use_gpu=False) + self._testLogSumExpAllArgs(x, axis=axis, use_gpu=True) + + def _testGradient(self, x, use_gpu=False, **kwargs): + with self.cached_session(use_gpu=use_gpu): + x = ops.convert_to_tensor(x, dtype=dtypes.float64) + + grad_naive_theoretical, _ = gradient_checker_v2.compute_gradient( + lambda y: math_ops.cumsum(math_ops.exp(y), **kwargs), [x]) + grad_fused_theoretical, _ = gradient_checker_v2.compute_gradient( + lambda y: math_ops.exp(math_ops.cumulative_logsumexp(y, **kwargs)), + [x]) + + self.assertAllClose(grad_fused_theoretical, grad_naive_theoretical) + + def testGradient(self): + for reverse in (True, False): + for exclusive in (True, False): + x = np.arange(10) / 10.0 - 0.5 + + 
self._testGradient(x, use_gpu=False, + reverse=reverse, exclusive=exclusive) + self._testGradient(x, use_gpu=True, + reverse=reverse, exclusive=exclusive) + + def _logSumExpMap(self, x): + return map_fn.map_fn( + lambda i: math_ops.reduce_logsumexp(x[:i + 1]), + math_ops.range(array_ops.shape(x)[0]), + dtype=x.dtype) + + def test1DLarge(self): + # This test ensures that the operation is correct even when the naive + # implementation would overflow. + x_np = np.arange(20) * 20.0 + + for use_gpu in (True, False): + with self.cached_session(use_gpu=use_gpu): + x_tf = ops.convert_to_tensor(x_np, dtype=dtypes.float32) + + result_fused = self.evaluate(math_ops.cumulative_logsumexp(x_tf)) + result_map = self.evaluate(self._logSumExpMap(x_tf)) + + self.assertAllClose(result_fused, result_map) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py index 0db8953b696..31e5895fd0b 100644 --- a/tensorflow/python/ops/math_grad.py +++ b/tensorflow/python/ops/math_grad.py @@ -1641,6 +1641,40 @@ def _CumprodGrad(op, grad): return [out / x, None] +@ops.RegisterGradient("CumulativeLogsumexp") +def _CumulativeLogsumexpGrad(op, grad): + x = op.inputs[0] + axis = op.inputs[1] + cumulative_logsumexp = op.outputs[0] + + exclusive = op.get_attr("exclusive") + reverse = op.get_attr("reverse") + + # Split the incoming gradient into positive and negative part + # in order to take logs. This is required for stable results. + log_grad_positive = array_ops.where_v2( + math_ops.greater(grad, 0), + math_ops.log(grad), + grad.dtype.min) + + log_grad_negative = array_ops.where_v2( + math_ops.less(grad, 0), + math_ops.log(-grad), + grad.dtype.min) + + output_pos = math_ops.exp( + math_ops.cumulative_logsumexp( + log_grad_positive - cumulative_logsumexp, + axis=axis, reverse=not reverse, exclusive=exclusive) + x) + + output_neg = math_ops.exp( + math_ops.cumulative_logsumexp( + log_grad_negative - cumulative_logsumexp, + axis=axis, reverse=not reverse, exclusive=exclusive) + x) + + return [output_pos - output_neg, None] + + @ops.RegisterGradient("NextAfter") def _NextAfterGrad(op, grad): """Returns gradient of nextafter(x1, x2) with respect to x1 and x2.""" diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 84372b3c922..9becce79cb1 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -3297,6 +3297,61 @@ def cumprod(x, axis=0, exclusive=False, reverse=False, name=None): x, axis, exclusive=exclusive, reverse=reverse, name=name) +@tf_export("math.cumulative_logsumexp", v1=["math.cumulative_logsumexp"]) +def cumulative_logsumexp(x, axis=0, exclusive=False, reverse=False, name=None): + """Compute the cumulative log-sum-exp of the tensor `x` along `axis`. + + By default, this op performs an inclusive cumulative log-sum-exp, which means + that the first element of the input is identical to the first element of + the output. + + This operation is significantly more numerically stable than the equivalent + tensorflow operation `tf.math.log(tf.math.cumsum(tf.math.exp(x)))`, although + computes the same result given infinite numerical precision. However, note + that in some cases, it may be less stable than `tf.math.reduce_logsumexp` + for a given element, as it applies the "log-sum-exp trick" in a different + way. 
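For reference, the stability gap being described is easy to reproduce outside TensorFlow. The following is a minimal NumPy sketch (not part of this patch; `log_add_exp` and `cumlse` are illustrative names) of the pairwise operator discussed just below and the prefix scan built from it, using the same kind of large inputs as `test1DLarge` above:

```python
import numpy as np

def log_add_exp(a, b):
  # log(exp(a) + exp(b)) without evaluating exp() on large arguments.
  m = np.maximum(a, b)
  return m + np.log1p(np.exp(np.minimum(a, b) - m))

def cumlse(x):
  # Inclusive prefix scan with log_add_exp as the combining operator.
  out = np.empty(len(x))
  acc = -np.inf  # identity element of log_add_exp
  for i, v in enumerate(x):
    acc = log_add_exp(acc, v)
    out[i] = acc
  return out

x = (np.arange(20) * 20.0).astype(np.float32)  # exp() overflows float32 above ~88
naive = np.log(np.cumsum(np.exp(x)))           # saturates to inf
stable = cumlse(x)                             # stays finite; ~= x elementwise here
print(naive[-1], stable[-1])
```

With float32 inputs this large, the naive cumsum-of-exp form returns `inf` while the pairwise-scan form stays finite, which is exactly the property the `test1DLarge` case above asserts against `tf.math.reduce_logsumexp`.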
+ + More precisely, where `tf.math.reduce_logsumexp` uses the following trick: + + ``` + log(sum(exp(x))) == log(sum(exp(x - max(x)))) + max(x) + ``` + + it cannot be directly used here as there is no fast way of applying it + to each prefix `x[:i]`. Instead, this function implements a prefix + scan using pairwise log-add-exp, which is a commutative and associative + (up to floating point precision) operator: + + ``` + log_add_exp(x, y) = log(exp(x) + exp(y)) + = log(1 + exp(min(x, y) - max(x, y))) + max(x, y) + ``` + + However, reducing using the above operator leads to a different computation + tree (logs are taken repeatedly instead of only at the end), and the maximum + is only computed pairwise instead of over the entire prefix. In general, this + leads to a different and slightly less precise computation. + + Args: + x: A `Tensor`. Must be one of the following types: `float16`, `float32`, + `float64`. + axis: A `Tensor` of type `int32` or `int64` (default: 0). Must be in the + range `[-rank(x), rank(x))`. + exclusive: If `True`, perform exclusive cumulative log-sum-exp. + reverse: If `True`, performs the cumulative log-sum-exp in the reverse + direction. + name: A name for the operation (optional). + + Returns: + A `Tensor`. Has the same shape and type as `x`. + """ + with ops.name_scope(name, "CumulativeLogsumexp", [x]) as name: + x = ops.convert_to_tensor(x, name="x") + return gen_math_ops.cumulative_logsumexp( + x, axis, exclusive=exclusive, reverse=reverse, name=name) + + @tf_export("math.conj", v1=["math.conj", "conj"]) @dispatch.add_dispatch_support @deprecation.deprecated_endpoints("conj") diff --git a/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt index 1fd765a5f81..5e3376d84c9 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt @@ -112,6 +112,10 @@ tf_module { name: "cumsum" argspec: "args=[\'x\', \'axis\', \'exclusive\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\', \'None\'], " } + member_method { + name: "cumulative_logsumexp" + argspec: "args=[\'x\', \'axis\', \'exclusive\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\', \'None\'], " + } member_method { name: "digamma" argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt index eef3ed54817..c247479d35e 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt @@ -844,6 +844,10 @@ tf_module { name: "Cumsum" argspec: "args=[\'x\', \'axis\', \'exclusive\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " } + member_method { + name: "CumulativeLogsumexp" + argspec: "args=[\'x\', \'axis\', \'exclusive\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " + } member_method { name: "DataFormatDimMap" argspec: "args=[\'x\', \'src_format\', \'dst_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'NCHW\', \'None\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt index 3ec5c656b3f..f0f6373a5a5 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt +++ 
b/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt @@ -112,6 +112,10 @@ tf_module { name: "cumsum" argspec: "args=[\'x\', \'axis\', \'exclusive\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\', \'None\'], " } + member_method { + name: "cumulative_logsumexp" + argspec: "args=[\'x\', \'axis\', \'exclusive\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\', \'None\'], " + } member_method { name: "digamma" argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt index eef3ed54817..c247479d35e 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt @@ -844,6 +844,10 @@ tf_module { name: "Cumsum" argspec: "args=[\'x\', \'axis\', \'exclusive\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " } + member_method { + name: "CumulativeLogsumexp" + argspec: "args=[\'x\', \'axis\', \'exclusive\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], " + } member_method { name: "DataFormatDimMap" argspec: "args=[\'x\', \'src_format\', \'dst_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'NCHW\', \'None\'], " From 1aaccb3fd98d30aacbd167a09be965b7f8a760eb Mon Sep 17 00:00:00 2001 From: Rachel Lim Date: Mon, 22 Jul 2019 07:21:35 -0700 Subject: [PATCH 0291/3053] [tf.data] Update rebatching to use a "fallback" method when it can't find a Batch dataset. Also fixes some edge cases. PiperOrigin-RevId: 259322208 --- .../core/grappler/optimizers/data/BUILD | 1 + .../grappler/optimizers/data/graph_utils.cc | 13 + .../grappler/optimizers/data/graph_utils.h | 3 + .../core/grappler/optimizers/data/rebatch.cc | 420 +++++++++++++++--- .../core/grappler/optimizers/data/rebatch.h | 1 + .../data/experimental/rebatch_dataset_op.cc | 19 +- .../core/ops/experimental_dataset_ops.cc | 2 + .../kernel_tests/rebatch_dataset_test.py | 97 +++- .../data/experimental/ops/distribute.py | 10 +- .../distribute/distribute_strategy_test.py | 21 +- .../api/golden/v1/tensorflow.raw_ops.pbtxt | 4 +- .../api/golden/v2/tensorflow.raw_ops.pbtxt | 4 +- 12 files changed, 522 insertions(+), 73 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/data/BUILD b/tensorflow/core/grappler/optimizers/data/BUILD index 8fffe36e84d..6db3c5a40ff 100644 --- a/tensorflow/core/grappler/optimizers/data/BUILD +++ b/tensorflow/core/grappler/optimizers/data/BUILD @@ -682,6 +682,7 @@ cc_library( srcs = ["rebatch.cc"], hdrs = ["rebatch.h"], deps = [ + ":function_utils", ":graph_utils", ":optimizer_base", "@com_google_absl//absl/container:flat_hash_map", diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils.cc b/tensorflow/core/grappler/optimizers/data/graph_utils.cc index 758f7786aff..a11717e270a 100644 --- a/tensorflow/core/grappler/optimizers/data/graph_utils.cc +++ b/tensorflow/core/grappler/optimizers/data/graph_utils.cc @@ -17,6 +17,7 @@ limitations under the License. 
#include "tensorflow/core/framework/device_base.h" #include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/util/ptr_util.h" @@ -239,6 +240,18 @@ NodeDef* GetInputNode(const NodeDef& node, const MutableGraphView& graph, return graph.GetRegularFanin(input_port).node; } +Status GetDatasetOutputTypesAttr(const NodeDef& node, AttrValue* output_types) { + // We don't name the output_types attr consistently, so should check for both. + for (const string& attr_name : {"output_types", "Toutput_types"}) { + if (node.attr().contains(attr_name)) { + *output_types = node.attr().at(attr_name); + return Status::OK(); + } + } + return errors::InvalidArgument("Could not find output_types attr for node: ", + node.name(), " with op: ", node.op()); +} + void SetUniqueGraphNodeName(StringPiece prefix, GraphDef* graph, NodeDef* node) { string name = string(prefix); diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils.h b/tensorflow/core/grappler/optimizers/data/graph_utils.h index 417a8c4ffd1..341eec46158 100644 --- a/tensorflow/core/grappler/optimizers/data/graph_utils.h +++ b/tensorflow/core/grappler/optimizers/data/graph_utils.h @@ -113,6 +113,9 @@ NodeDef* GetInputNode(const NodeDef& node, const MutableGraphView& graph); NodeDef* GetInputNode(const NodeDef& node, const MutableGraphView& graph, int64 i); +// Gets the attr corresponding to a dataset node's output types, if it exists. +Status GetDatasetOutputTypesAttr(const NodeDef& node, AttrValue* output_types); + // Returns the list of indices of all nodes with the given op or empty list if // no such node exists. std::vector FindAllGraphNodesWithOp(const string& op, diff --git a/tensorflow/core/grappler/optimizers/data/rebatch.cc b/tensorflow/core/grappler/optimizers/data/rebatch.cc index b3e7f8febe3..bcea9eea8fd 100644 --- a/tensorflow/core/grappler/optimizers/data/rebatch.cc +++ b/tensorflow/core/grappler/optimizers/data/rebatch.cc @@ -18,11 +18,13 @@ limitations under the License. #include "absl/container/flat_hash_map.h" #include "tensorflow/core/framework/attr_value_util.h" #include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/versions.pb.h" #include "tensorflow/core/grappler/grappler_item.h" #include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h" +#include "tensorflow/core/grappler/optimizers/data/function_utils.h" #include "tensorflow/core/grappler/optimizers/data/graph_utils.h" #include "tensorflow/core/grappler/utils/functions.h" #include "tensorflow/core/lib/core/errors.h" @@ -32,9 +34,12 @@ namespace grappler { Status RebatchOptimizer::Init( const tensorflow::RewriterConfig_CustomGraphOptimizer* config) { - if (!config) return Status::OK(); + if (!config) + return errors::InvalidArgument( + "Cannot initialize RebatchOptimizer without config."); num_workers_ = config->parameter_map().at("num_workers").i(); + use_fallback_ = config->parameter_map().at("use_fallback").b(); return Status::OK(); } @@ -59,6 +64,11 @@ constexpr std::array kMultipleInputsDatasetOps = { "ZipDataset" }; +// TODO(rachelim): We might want to be more conservative here and not allow +// passthrough for ops like "Map", "ParallelMap" etc which may change the +// batch dimension. 
Furthermore, transformations like "Skip" may change +// the semantics of the dataset (since we'd be skipping N minibatches instead +// of N batches). constexpr std::array kPassThroughOps = { "CacheDataset", "ExperimentalScanDataset", @@ -119,6 +129,97 @@ NodeDef* AddBinaryNode(const string& input_x, const string& input_y, return graph->AddNode(std::move(node)); } +// Adds a Const node to the FunctionDef. +Status AddConstIntNode(gtl::ArraySlice values, const TensorShape& shape, + FunctionDef* fdef, NodeDef** result) { + if (shape.dims() > 1) { + return errors::InvalidArgument("Cannot add const node with rank > 1"); + } + *result = fdef->add_node_def(); + TensorProto tensor_proto; + tensor_proto.set_dtype(DT_INT32); + if (shape.dims() == 0) { + // Scalar + DCHECK_EQ(values.size(), 1); + } else { + // vector + DCHECK_EQ(values.size(), shape.dim_size(0)); + tensor_proto.mutable_tensor_shape()->add_dim()->set_size(shape.dim_size(0)); + } + + for (int value : values) { + *tensor_proto.mutable_int_val()->Add() = value; + } + + TF_RETURN_IF_ERROR(NodeDefBuilder("", "Const") + .Attr("dtype", DT_INT32) + .Attr("value", tensor_proto) + .Finalize(*result)); + function_utils::SetUniqueFunctionNodeName("rebatch/const", fdef, *result); + + return Status::OK(); +} + +Status AddShapeNode(const NodeDefBuilder::NodeOut& input, FunctionDef* fdef, + NodeDef** result) { + *result = fdef->add_node_def(); + TF_RETURN_IF_ERROR( + NodeDefBuilder("", "Shape").Input(input).Finalize(*result)); + function_utils::SetUniqueFunctionNodeName("rebatch/shape", fdef, *result); + return Status::OK(); +} + +Status AddStridedSliceNode(const NodeDefBuilder::NodeOut& input, + const NodeDefBuilder::NodeOut& begin, + const NodeDefBuilder::NodeOut& end, + const NodeDefBuilder::NodeOut& strides, + DataType index, int32 begin_mask, + int32 ellipsis_mask, int32 end_mask, + int32 new_axis_mask, int32 shrink_axis_mask, + FunctionDef* fdef, NodeDef** result) { + *result = fdef->add_node_def(); + TF_RETURN_IF_ERROR(NodeDefBuilder("", "StridedSlice") + .Input(input) + .Input(begin) + .Input(end) + .Input(strides) + .Attr("Index", index) + .Attr("begin_mask", begin_mask) + .Attr("ellipsis_mask", ellipsis_mask) + .Attr("end_mask", end_mask) + .Attr("new_axis_mask", new_axis_mask) + .Attr("shrink_axis_mask", shrink_axis_mask) + .Finalize(*result)); + function_utils::SetUniqueFunctionNodeName("rebatch/strided_slice", fdef, + *result); + return Status::OK(); +} + +Status AddConcatNode(gtl::ArraySlice values, + NodeDefBuilder::NodeOut axis, int32 n, FunctionDef* fdef, + NodeDef** result) { + *result = fdef->add_node_def(); + TF_RETURN_IF_ERROR(NodeDefBuilder("", "ConcatV2") + .Input(values) + .Input(axis) + .Attr("N", n) + .Finalize(*result)); + function_utils::SetUniqueFunctionNodeName("rebatch/concat", fdef, *result); + return Status::OK(); +} + +Status AddReshapeNode(NodeDefBuilder::NodeOut tensor, + NodeDefBuilder::NodeOut shape, FunctionDef* fdef, + NodeDef** result) { + *result = fdef->add_node_def(); + TF_RETURN_IF_ERROR(NodeDefBuilder("", "Reshape") + .Input(tensor) + .Input(shape) + .Finalize(*result)); + function_utils::SetUniqueFunctionNodeName("rebatch/reshape", fdef, *result); + return Status::OK(); +} + template bool IsDatasetNodeOfType(const NodeDef& node, const std::array& arr) { @@ -128,19 +229,56 @@ bool IsDatasetNodeOfType(const NodeDef& node, return false; } +void SetUnknownShapes(int num_components, AttrValue* output_shapes) { + for (int i = 0; i < num_components; ++i) { + 
output_shapes->mutable_list()->mutable_shape()->Add()->set_unknown_rank( + true); + } +} + +Status GetBatchDim(AttrValue output_shapes, int* batch_dim) { + const auto& shape_0 = output_shapes.list().shape(0); + if (shape_0.unknown_rank() || shape_0.dim(0).size() == -1) { + return errors::InvalidArgument( + "Cannot use rebatching fallback when 0th dimensions of dataset " + "components are not fully known. Component 0 has shape: ", + shape_0.ShortDebugString()); + } + + *batch_dim = output_shapes.list().shape(0).dim(0).size(); + + for (int i = 1; i < output_shapes.list().shape_size(); ++i) { + const auto& shape_i = output_shapes.list().shape(i); + + if (shape_i.unknown_rank() || shape_i.dim(0).size() == -1) { + return errors::InvalidArgument( + "Cannot use rebatching fallback when 0th dimensions of dataset " + "components are not fully known. Component ", + i, " has shape: ", shape_i.ShortDebugString()); + } + if (shape_i.dim(0).size() != *batch_dim) { + return errors::InvalidArgument( + "Cannot use rebatching fallback when 0th dimensions of dataset " + "components don't match. Component ", + i, " has batch dimension: ", shape_i.dim(0).size(), + " while previous components have batch dimension: ", *batch_dim); + } + } + return Status::OK(); +} + Status UpdateOutputShapes(const string& node_name, int64 num_workers, MutableGraphView* graph) { NodeDef* node = graph->GetNode(node_name); - if (node->op() == kIdentityOp) { - return Status::OK(); - } - AttrValue output_shapes = node->attr().at("output_shapes"); - for (auto& shape : *output_shapes.mutable_list()->mutable_shape()) { - if (shape.dim(0).size() != -1) { - shape.mutable_dim(0)->set_size(shape.dim(0).size() / num_workers); + if (node->attr().contains("output_shapes")) { + AttrValue output_shapes = node->attr().at("output_shapes"); + for (auto& shape : *output_shapes.mutable_list()->mutable_shape()) { + if (!shape.unknown_rank() && shape.dim(0).size() != -1) { + shape.mutable_dim(0)->set_size(shape.dim(0).size() / num_workers); + } } + (*node->mutable_attr())["output_shapes"] = output_shapes; } - (*node->mutable_attr())["output_shapes"] = output_shapes; return Status::OK(); } @@ -193,7 +331,7 @@ Status MutateBatchSize(const NodeDef& node, int64 num_workers, } Status OptimizeGraph(const GrapplerItem& item, int64 num_workers, - GraphDef* output); + bool use_fallback, GraphDef* output); // Helper function that starts from a node in the graph and recurses into its // inputs trying to find a BatchDataset type operation to modify. During the @@ -204,26 +342,24 @@ Status OptimizeGraph(const GrapplerItem& item, int64 num_workers, // 3. Core dataset ops + Identity op: Recurses into first input parameter. // 4. FlatMap type mapping dataset ops: Recurses into the function definition. Status RecursivelyHandleOp(const NodeDef& node, int64 num_workers, - FunctionLibraryDefinition* flib, + bool use_fallback, FunctionLibraryDefinition* flib, MutableGraphView* graph) { if (IsDatasetNodeOfType(node, kBatchDatasetOps)) { TF_RETURN_IF_ERROR(MutateBatchSize(node, num_workers, graph)); - TF_RETURN_IF_ERROR(UpdateOutputShapes(node.name(), num_workers, graph)); } else if (IsDatasetNodeOfType(node, kMultipleInputsDatasetOps)) { // For all multiple input datasets, all inputs are datasets themselves. 
for (int i = 0; i < node.input_size(); ++i) { NodeDef* input_node = graph_utils::GetInputNode(node, *graph, i); - TF_RETURN_IF_ERROR( - RecursivelyHandleOp(*input_node, num_workers, flib, graph)); + TF_RETURN_IF_ERROR(RecursivelyHandleOp(*input_node, num_workers, + use_fallback, flib, graph)); } - TF_RETURN_IF_ERROR(UpdateOutputShapes(node.name(), num_workers, graph)); - } else if (IsDatasetNodeOfType(node, kPassThroughOps)) { - // For all the dataset ops that are pass through, the input dataset is + } else if (IsDatasetNodeOfType(node, kPassThroughOps) || IsRetval(node)) { + // For all the dataset ops that are passthrough, or _Retvals added to the + // function body graph in place of function outputs, the input dataset is // input 0. NodeDef* input_node = graph_utils::GetInputNode(node, *graph, 0); - TF_RETURN_IF_ERROR( - RecursivelyHandleOp(*input_node, num_workers, flib, graph)); - TF_RETURN_IF_ERROR(UpdateOutputShapes(node.name(), num_workers, graph)); + TF_RETURN_IF_ERROR(RecursivelyHandleOp(*input_node, num_workers, + use_fallback, flib, graph)); } else if (IsDatasetNodeOfType(node, kFuncDatasetOps)) { const string func_name = node.attr().at(kFuncDatasetOpFuncs->at(node.op())).func().name(); @@ -232,42 +368,210 @@ Status RecursivelyHandleOp(const NodeDef& node, int64 num_workers, TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem( *fdef, *flib, graph->graph()->versions().producer(), &f_item)); GraphDef optimized_func_graph; - Status s = OptimizeGraph(f_item, num_workers, &optimized_func_graph); - if (s.ok()) { - // Function body optimization might have created new specialized - // functions for each instantiation context. Add them to the library. - for (const FunctionDef& func_def : - optimized_func_graph.library().function()) { - if (flib->Find(func_def.signature().name()) == nullptr) { - TF_RETURN_IF_ERROR(flib->AddFunctionDef(func_def)); - } + TF_RETURN_IF_ERROR(OptimizeGraph(f_item, num_workers, use_fallback, + &optimized_func_graph)); + + // Function body optimization might have created new specialized + // functions for each instantiation context. Add them to the library. + for (const FunctionDef& func_def : + optimized_func_graph.library().function()) { + if (flib->Find(func_def.signature().name()) == nullptr) { + TF_RETURN_IF_ERROR(flib->AddFunctionDef(func_def)); } - - // Convert optimized graph back to FunctionDef. - FunctionDef optimized_func; - f_item.SwapFunctionBody(std::move(optimized_func_graph)); - TF_RETURN_IF_ERROR(MakeFunctionDef(f_item, *flib, &optimized_func)); - - // Replace optimized function with a new FunctionDef. - TF_RETURN_IF_ERROR(flib->ReplaceFunction(func_name, optimized_func)); - TF_RETURN_IF_ERROR(UpdateOutputShapes(node.name(), num_workers, graph)); - } else { - VLOG(2) << "Failed to optimize dataset function. Error: " - << s.error_message(); } + + // Convert optimized graph back to FunctionDef. + FunctionDef optimized_func; + f_item.SwapFunctionBody(std::move(optimized_func_graph)); + TF_RETURN_IF_ERROR(MakeFunctionDef(f_item, *flib, &optimized_func)); + + // Replace optimized function with a new FunctionDef. + TF_RETURN_IF_ERROR(flib->ReplaceFunction(func_name, optimized_func)); } else if (IsDatasetNodeOfType(node, kSourceDatasetOps)) { return errors::InvalidArgument( "Reached a source dataset: ", node.op(), " without encountering a batch transformation."); - } else if (IsRetval(node)) { - // _Retvals added to the function body graph in place of function outputs. 
- NodeDef* input_node = graph_utils::GetInputNode(node, *graph, 0); - TF_RETURN_IF_ERROR( - RecursivelyHandleOp(*input_node, num_workers, flib, graph)); } else { return errors::InvalidArgument("Encountered an unsupported op: ", node.op()); } + // If we've successfully updated the batch size of this node or any nodes + // in the dataset tree rooted in this node, we update the output_shapes attr. + TF_RETURN_IF_ERROR(UpdateOutputShapes(node.name(), num_workers, graph)); + return Status::OK(); +} + +// Add nodes to the function to reshape arg to shape (-1, new_batch_dim, ...) +Status ReshapeComponent(int new_batch_dim, StringPiece arg, DataType dtype, + FunctionDef* fdef, string* result) { + // Const with value [0] + NodeDef* const_vec_0; + TF_RETURN_IF_ERROR(AddConstIntNode({0}, {1}, fdef, &const_vec_0)); + + // Const with value [1] + NodeDef* const_vec_1; + TF_RETURN_IF_ERROR(AddConstIntNode({1}, {1}, fdef, &const_vec_1)); + + // Const with value 0 + NodeDef* const_0; + TF_RETURN_IF_ERROR(AddConstIntNode({0}, {}, fdef, &const_0)); + + // Const with value [-1, new_batch_dim] + NodeDef* first_two_dims; + TF_RETURN_IF_ERROR( + AddConstIntNode({-1, new_batch_dim}, {2}, fdef, &first_two_dims)); + + // shape = tf.shape(arg) + NodeDef* shape; + TF_RETURN_IF_ERROR(AddShapeNode({arg, 0, dtype}, fdef, &shape)); + + // later_dimensions = tf.shape(arg)[1:] + NodeDef* later_dimensions; + TF_RETURN_IF_ERROR(AddStridedSliceNode( + {strings::StrCat(shape->name(), ":output"), 0, DT_INT32}, + {strings::StrCat(const_vec_1->name(), ":output"), 0, DT_INT32}, + {strings::StrCat(const_vec_0->name(), ":output"), 0, DT_INT32}, + {strings::StrCat(const_vec_1->name(), ":output"), 0, DT_INT32}, DT_INT32, + 0, 0, 1, 0, 0, fdef, &later_dimensions)); + + // new_shape = tf.concat([pack, later_dimensions], 0) + NodeDef* new_shape; + TF_RETURN_IF_ERROR(AddConcatNode( + {{strings::StrCat(first_two_dims->name(), ":output"), 0, DT_INT32}, + {strings::StrCat(later_dimensions->name(), ":output"), 0, DT_INT32}}, + {strings::StrCat(const_0->name(), ":output"), 0, DT_INT32}, 2, fdef, + &new_shape)); + + NodeDef* reshape; + TF_RETURN_IF_ERROR(AddReshapeNode( + {arg, 0, dtype}, + {strings::StrCat(new_shape->name(), ":output"), 0, DT_INT32}, fdef, + &reshape)); + *result = reshape->name(); + + return Status::OK(); +} + +Status CreateFlatMapFn(int new_batch_dim, const AttrValue& types, + FunctionDef* result) { + std::vector tensor_slice_dataset_inputs; + + // For each component of the dataset, we reshape it from shape + // (old_batch_size, ...) to (-1, new_batch_size, ...) + // where new_batch_size = (old_batch_size + num_workers - 1) // num_workers + for (int i = 0; i < types.list().type_size(); ++i) { + string arg = strings::StrCat("args_", i); + auto* input_arg = result->mutable_signature()->mutable_input_arg()->Add(); + input_arg->set_type(types.list().type(i)); + input_arg->set_name(arg); + + string reshape_node_name; + TF_RETURN_IF_ERROR(ReshapeComponent( + new_batch_dim, arg, types.list().type(i), result, &reshape_node_name)); + + tensor_slice_dataset_inputs.emplace_back( + strings::StrCat(reshape_node_name, ":output"), 0, types.list().type(i)); + } + + // The output_shapes attr here doesn't make a difference, since we + // set the output_shapes of the external FlatMap node. 
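Read as ordinary TensorFlow ops rather than hand-built `NodeDef`s, the graph emitted by `ReshapeComponent` computes roughly the following (an illustrative eager-mode sketch, not part of this patch; `reshape_component` is a made-up name):

```python
import tensorflow as tf

def reshape_component(x, new_batch_dim):
  # Const([-1, new_batch_dim]) concatenated with Shape(x)[1:], fed to Reshape.
  first_two_dims = tf.constant([-1, new_batch_dim], dtype=tf.int32)
  later_dimensions = tf.shape(x)[1:]  # Shape + StridedSlice from index 1 onward
  new_shape = tf.concat([first_two_dims, later_dimensions], axis=0)  # ConcatV2
  return tf.reshape(x, new_shape)

x = tf.zeros([32, 5, 3])                  # a batch of 32 examples
print(reshape_component(x, 8).shape)      # (4, 8, 5, 3)
```

Slicing the result along its new leading dimension then yields minibatches of `new_batch_dim` examples each, which is what the `TensorSliceDataset` constructed below provides.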
+ AttrValue shapes; + SetUnknownShapes(types.list().type_size(), &shapes); + + NodeDef* tensor_slice_dataset = result->add_node_def(); + TF_RETURN_IF_ERROR(NodeDefBuilder("", "TensorSliceDataset") + .Input(tensor_slice_dataset_inputs) + .Attr("Toutput_types", types) + .Attr("output_shapes", shapes) + .Finalize(tensor_slice_dataset)); + function_utils::SetUniqueFunctionNodeName("rebatch/tensor_slice_dataset", + result, tensor_slice_dataset); + + auto* output_arg = result->mutable_signature()->mutable_output_arg()->Add(); + output_arg->set_name("output"); + output_arg->set_type(DT_VARIANT); + result->mutable_signature()->set_is_stateful(true); + (*result->mutable_ret())["output"] = + strings::StrCat(tensor_slice_dataset->name(), ":handle:0"); + + return Status::OK(); +} + +// We fallback to the following rewrite: +// ``` +// dataset = ...fetch_node... +// def fn(x): +// return tf.data.Dataset.from_tensor_slices( +// tf.reshape( +// x, +// tf.concat([[-1, old_batch_dim / num_workers], tf.shape(x)[1:]], 0) +// ) +// ) +// +// dataset = dataset.flat_map(fn) +// ``` +Status RebatchWithFallback(const NodeDef* fetch_node, int64 num_workers, + FunctionLibraryDefinition* flib, + MutableGraphView* graph) { + if (IsRetval(*fetch_node) || fetch_node->op() == kIdentityOp) { + // Get the last dataset in the pipeline + fetch_node = graph_utils::GetInputNode(*fetch_node, *graph, 0); + } + + // Note: Here, we are conservative with only using the fallback when + // the output_shapes attr has the 0th dimension defined for every component. + // This because the flat_map_fn will fail if the batch does not divide evenly + // because of the use of the "Reshape" op. This ensures that the error is + // surfaced correctly. + AttrValue output_shapes; + if (!fetch_node->attr().contains("output_shapes")) { + return errors::InvalidArgument( + "Cannot use rebatching fallback without output_shapes attr. Node: ", + fetch_node->name(), " Op: ", fetch_node->op()); + } else { + output_shapes = fetch_node->attr().at("output_shapes"); + } + int batch_dim; + TF_RETURN_IF_ERROR(GetBatchDim(output_shapes, &batch_dim)); + if (batch_dim % num_workers != 0) { + return errors::InvalidArgument( + "Cannot use rebatching fallback when batch dimension doesn't divide " + "num_workers evenly."); + } + + // Create the flat map fn + FunctionDef flat_map_fn; + FunctionDefLibrary lib = flib->ToProto(); + graph_utils::SetUniqueGraphFunctionName("flat_map_fn", &lib, &flat_map_fn); + + // Get types of input arguments from the output types of the final dataset. 
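Expressed in the Python tf.data API instead of a grappler graph rewrite, the fallback sketched in the comment above behaves roughly like the following (illustrative only; like the rewrite, it assumes the batch dimension is statically known and divides `num_workers` evenly, and `rebatch_fallback` is a made-up helper name):

```python
import tensorflow as tf

def rebatch_fallback(dataset, batch_size, num_workers):
  new_batch = batch_size // num_workers   # must divide evenly

  def fn(x):
    # Reshape (batch_size, ...) -> (num_workers, new_batch, ...) and slice
    # along the leading dimension to emit num_workers smaller batches.
    new_shape = tf.concat([[-1, new_batch], tf.shape(x)[1:]], axis=0)
    return tf.data.Dataset.from_tensor_slices(tf.reshape(x, new_shape))

  return dataset.flat_map(fn)

ds = tf.data.Dataset.range(64).batch(32, drop_remainder=True)
for elem in rebatch_fallback(ds, batch_size=32, num_workers=4):
  print(elem.shape)   # eight elements of shape (8,) instead of two of shape (32,)
```

A single-component dataset is used here for brevity; the generated `flat_map` function applies the same reshape to every component and feeds them all to one `TensorSliceDataset`.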
+ AttrValue output_types; + TF_RETURN_IF_ERROR( + graph_utils::GetDatasetOutputTypesAttr(*fetch_node, &output_types)); + TF_RETURN_IF_ERROR( + CreateFlatMapFn(batch_dim / num_workers, output_types, &flat_map_fn)); + + TF_RETURN_IF_ERROR(flib->AddFunctionDef(flat_map_fn)); + AttrValue fn; + fn.mutable_func()->set_name(flat_map_fn.signature().name()); + + NodeDef flat_map_node; + TF_RETURN_IF_ERROR( + NodeDefBuilder("", "FlatMapDataset") + .Input(fetch_node->name(), 0, DT_VARIANT) + .Input(std::vector()) // other_arguments + .Attr("f", fn) + .Attr("Targuments", std::vector()) + .Attr("output_types", output_types) + .Attr("output_shapes", output_shapes) + .Finalize(&flat_map_node)); + graph_utils::SetUniqueGraphNodeName("rebatch/flat_map", graph->graph(), + &flat_map_node); + NodeDef* added = graph->AddNode(std::move(flat_map_node)); + TF_RETURN_IF_ERROR(UpdateOutputShapes(added->name(), num_workers, graph)); + + TF_RETURN_IF_ERROR(graph->UpdateFanouts(fetch_node->name(), added->name())); + return Status::OK(); } @@ -275,7 +579,7 @@ Status RecursivelyHandleOp(const NodeDef& node, int64 num_workers, // with the batch size changed. The GrapplerItem could be generated from the // main graph or could be a function graph. Status OptimizeGraph(const GrapplerItem& item, int64 num_workers, - GraphDef* output) { + bool use_fallback, GraphDef* output) { *output = item.graph; MutableGraphView graph(output); @@ -283,8 +587,24 @@ Status OptimizeGraph(const GrapplerItem& item, int64 num_workers, NodeDef* sink_node; TF_RETURN_IF_ERROR(graph_utils::GetFetchNode(graph, item, &sink_node)); - TF_RETURN_IF_ERROR( - RecursivelyHandleOp(*sink_node, num_workers, &flib, &graph)); + + Status s = + RecursivelyHandleOp(*sink_node, num_workers, use_fallback, &flib, &graph); + if (!s.ok()) { + if (use_fallback) { + VLOG(1) << "Couldn't find a batch transformation. Using a fallback method" + " to rebatch dataset."; + // If RecursivelyHandleOp fails, we reset `graph` to use the original, + // graph, since that function may have mutated `graph`. 
+ *output = item.graph; + graph = MutableGraphView(output); + TF_RETURN_IF_ERROR( + RebatchWithFallback(sink_node, num_workers, &flib, &graph)); + } else { + // Return the error + return s; + } + } *output->mutable_library() = flib.ToProto(); return Status::OK(); } @@ -298,7 +618,7 @@ Status RebatchOptimizer::OptimizeAndCollectStats(Cluster* cluster, *output = item.graph; MutableGraphView graph(output); - TF_RETURN_IF_ERROR(OptimizeGraph(item, num_workers_, output)); + TF_RETURN_IF_ERROR(OptimizeGraph(item, num_workers_, use_fallback_, output)); stats->num_changes++; return Status::OK(); } diff --git a/tensorflow/core/grappler/optimizers/data/rebatch.h b/tensorflow/core/grappler/optimizers/data/rebatch.h index 29a61000264..75c965824cc 100644 --- a/tensorflow/core/grappler/optimizers/data/rebatch.h +++ b/tensorflow/core/grappler/optimizers/data/rebatch.h @@ -44,6 +44,7 @@ class RebatchOptimizer : public TFDataOptimizerBase { private: int64 num_workers_; + bool use_fallback_; }; } // namespace grappler diff --git a/tensorflow/core/kernels/data/experimental/rebatch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/rebatch_dataset_op.cc index b75c2422f21..ac351ebe5e6 100644 --- a/tensorflow/core/kernels/data/experimental/rebatch_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/rebatch_dataset_op.cc @@ -21,11 +21,16 @@ namespace data { namespace { constexpr char kOptimizerName[] = "tf_data_rebatcher"; +constexpr char kUseFallbackAttr[] = "use_fallback"; class RebatchDatasetOp : public UnaryDatasetOpKernel { public: explicit RebatchDatasetOp(OpKernelConstruction* ctx) - : UnaryDatasetOpKernel(ctx) {} + : UnaryDatasetOpKernel(ctx) { + if (ctx->HasAttr(kUseFallbackAttr)) { + OP_REQUIRES_OK(ctx, ctx->GetAttr(kUseFallbackAttr, &use_fallback_)); + } + } protected: void MakeDataset(OpKernelContext* ctx, DatasetBase* input, @@ -36,7 +41,9 @@ class RebatchDatasetOp : public UnaryDatasetOpKernel { ctx, num_workers > 0, errors::InvalidArgument("num_workers must be greater than zero.")); - auto config_factory = [num_workers]() { return CreateConfig(num_workers); }; + auto config_factory = [num_workers, this]() { + return CreateConfig(num_workers, this->use_fallback_); + }; // We only want to optimize functions for some particular datasets like // FlatMapDataset, InterleaveDataset etc. 
So we disable generalized @@ -48,7 +55,7 @@ class RebatchDatasetOp : public UnaryDatasetOpKernel { } private: - static RewriterConfig CreateConfig(int64 num_workers) { + static RewriterConfig CreateConfig(int64 num_workers, bool use_fallback) { RewriterConfig rewriter_config; rewriter_config.set_fail_on_optimizer_errors(true); rewriter_config.add_optimizers(kOptimizerName); @@ -59,8 +66,14 @@ class RebatchDatasetOp : public UnaryDatasetOpKernel { num_workers_attr.set_i(num_workers); (*custom_optimizer->mutable_parameter_map())["num_workers"] = num_workers_attr; + AttrValue use_fallback_attr; + use_fallback_attr.set_b(use_fallback); + (*custom_optimizer->mutable_parameter_map())["use_fallback"] = + use_fallback_attr; return rewriter_config; } + + bool use_fallback_ = true; }; REGISTER_KERNEL_BUILDER(Name("RebatchDataset").Device(DEVICE_CPU), diff --git a/tensorflow/core/ops/experimental_dataset_ops.cc b/tensorflow/core/ops/experimental_dataset_ops.cc index 1d4c3a0e8ba..5504f5e577b 100644 --- a/tensorflow/core/ops/experimental_dataset_ops.cc +++ b/tensorflow/core/ops/experimental_dataset_ops.cc @@ -662,6 +662,7 @@ REGISTER_OP("ExperimentalRebatchDataset") .Output("handle: variant") .Attr("output_types: list(type) >= 1") .Attr("output_shapes: list(shape) >= 1") + .Attr("use_fallback: bool = true") .SetShapeFn(shape_inference::ScalarShape); REGISTER_OP("RebatchDataset") @@ -670,6 +671,7 @@ REGISTER_OP("RebatchDataset") .Output("handle: variant") .Attr("output_types: list(type) >= 1") .Attr("output_shapes: list(shape) >= 1") + .Attr("use_fallback: bool = true") .SetShapeFn(shape_inference::ScalarShape); REGISTER_OP("SamplingDataset") diff --git a/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py index 82c13cb8491..c36ea688880 100644 --- a/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py @@ -278,7 +278,18 @@ class RebatchDatasetTest(test_base.DatasetTestBase): dataset = dataset_ops.Dataset.range(1024).batch( 32, drop_remainder=drop_remainder).apply(sleep.sleep(10)) with self.assertRaises(errors.InvalidArgumentError): - rebatched_dataset = distribute._RebatchDataset(dataset, num_workers=4) + rebatched_dataset = distribute._RebatchDataset( + dataset, num_workers=4, use_fallback=False) + next_element = self.getNext(rebatched_dataset) + self.evaluate(next_element()) + + def testUnsupportedTransformInFlatMapError(self, drop_remainder): + dataset = dataset_ops.Dataset.range(2).flat_map( + lambda _: dataset_ops.Dataset.range(32).batch( # pylint: disable=g-long-lambda + 32, drop_remainder=drop_remainder).apply(sleep.sleep(10))) + with self.assertRaises(errors.InvalidArgumentError): + rebatched_dataset = distribute._RebatchDataset( + dataset, num_workers=4, use_fallback=False) next_element = self.getNext(rebatched_dataset) self.evaluate(next_element()) @@ -433,5 +444,89 @@ class RebatchDatasetTest(test_base.DatasetTestBase): self.assertDatasetProduces(rebatched_dataset, expected_output) +@test_util.run_all_in_graph_and_eager_modes +class RebatchDatasetFallbackTest(test_base.DatasetTestBase): + + def testWithNoBatchDataset(self): + dataset = dataset_ops.Dataset.from_tensor_slices( + [[k for k in range(i, i + 32)] for i in range(0, 1024, 32)]) # pylint: disable=g-complex-comprehension + rebatched_dataset = distribute._RebatchDataset(dataset, num_workers=4) + self.assertEqual([[32]], [ts.as_list() for ts in 
_flat_shapes(dataset)]) + self.assertEqual([[8]], + [ts.as_list() for ts in _flat_shapes(rebatched_dataset)]) + + expected_output = [[k for k in range(i, i + 8)] for i in range(0, 1024, 8)] # pylint: disable=g-complex-comprehension + self.assertDatasetProduces(rebatched_dataset, expected_output) + + def testWithUnhandledTransformation(self): + dataset = dataset_ops.Dataset.range(1024).batch( + 32, drop_remainder=True).apply(sleep.sleep(10)) + rebatched_dataset = distribute._RebatchDataset(dataset, num_workers=4) + self.assertEqual([[32]], [ts.as_list() for ts in _flat_shapes(dataset)]) + self.assertEqual([[8]], + [ts.as_list() for ts in _flat_shapes(rebatched_dataset)]) + + expected_output = [[k for k in range(i, i + 8)] for i in range(0, 1024, 8)] # pylint: disable=g-complex-comprehension + self.assertDatasetProduces(rebatched_dataset, expected_output) + + def testWithUnhandledTransformationInFlatMap(self): + dataset = dataset_ops.Dataset.range(2).flat_map( + lambda _: dataset_ops.Dataset.range(32).batch( # pylint: disable=g-long-lambda + 32, drop_remainder=True).apply(sleep.sleep(10))) + rebatched_dataset = distribute._RebatchDataset(dataset, num_workers=4) + + self.assertEqual([[8]], + [ts.as_list() for ts in _flat_shapes(rebatched_dataset)]) + + # Two elements where each element is a list of 4 elements where each element + # is a list of 8. + expected_output = [ + [k for k in range(i, i + 8)] # pylint: disable=g-complex-comprehension + for _ in range(2) for i in range(0, 32, 8)] # generates 4 elements + self.assertDatasetProduces(rebatched_dataset, expected_output) + + def testWithUnknownBatchDim(self): + dataset = dataset_ops.Dataset.range(1024).batch( + 32, drop_remainder=False).apply(sleep.sleep(10)) + + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "Cannot use rebatching fallback"): + rebatched_dataset = distribute._RebatchDataset(dataset, num_workers=4) + next_element = self.getNext(rebatched_dataset) + self.evaluate(next_element()) + + def testWithUnknownBatchDimInSecondComponent(self): + dataset0 = dataset_ops.Dataset.range(1024).batch(32, drop_remainder=True) + dataset1 = dataset_ops.Dataset.range(1024).batch( + 32, drop_remainder=False).apply(sleep.sleep(10)) + dataset = dataset_ops.Dataset.zip((dataset0, dataset1)) + + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "Cannot use rebatching fallback"): + rebatched_dataset = distribute._RebatchDataset(dataset, num_workers=4) + next_element = self.getNext(rebatched_dataset) + self.evaluate(next_element()) + + def testBatchSizeIndivisibleByNumWorkers(self): + # This doesn't work; reshape requires tensor shape to be exactly divisible + # by the second dim. 
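The divisibility requirement referred to here comes directly from the reshape in the fallback: a batch can only be laid out as `(-1, batch_size // num_workers)` rows when the division is exact. A tiny NumPy illustration (not part of the test):

```python
import numpy as np

np.arange(32).reshape(-1, 32 // 4)   # fine: 4 rows of 8
np.arange(32).reshape(-1, 32 // 5)   # raises ValueError: 32 is not a multiple of 6
```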
+ dataset = dataset_ops.Dataset.range(64).batch( + 32, drop_remainder=True).apply(sleep.sleep(10)) + + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "Cannot use rebatching fallback"): + rebatched_dataset = distribute._RebatchDataset(dataset, num_workers=5) + next_element = self.getNext(rebatched_dataset) + self.evaluate(next_element()) + + def testBatchSizesDontMatch(self): + dataset = dataset_ops.Dataset.from_tensors((np.arange(10), np.arange(5))) + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "Cannot use rebatching fallback"): + rebatched_dataset = distribute._RebatchDataset(dataset, num_workers=5) + next_element = self.getNext(rebatched_dataset) + self.evaluate(next_element()) + + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/data/experimental/ops/distribute.py b/tensorflow/python/data/experimental/ops/distribute.py index deeaa5f9fbe..b834fe8839a 100644 --- a/tensorflow/python/data/experimental/ops/distribute.py +++ b/tensorflow/python/data/experimental/ops/distribute.py @@ -76,7 +76,7 @@ def _AutoShardDatasetV1(input_dataset, num_workers, index): # pylint: disable=i class _RebatchDataset(dataset_ops.UnaryDataset): """A `Dataset` that divides the batch size by `num_workers`.""" - def __init__(self, input_dataset, num_workers): + def __init__(self, input_dataset, num_workers, use_fallback=True): self._input_dataset = input_dataset def recalculate_output_shapes(output_shapes): @@ -96,7 +96,13 @@ class _RebatchDataset(dataset_ops.UnaryDataset): self._element_spec = structure.convert_legacy_structure( input_types, output_shapes, input_classes) - if compat.forward_compatible(2019, 8, 3): + if compat.forward_compatible(2019, 8, 13) or not use_fallback: + variant_tensor = ged_ops.rebatch_dataset( + self._input_dataset._variant_tensor, # pylint: disable=protected-access + num_workers=num_workers, + use_fallback=use_fallback, + **self._flat_structure) + elif compat.forward_compatible(2019, 8, 3): variant_tensor = ged_ops.rebatch_dataset( self._input_dataset._variant_tensor, # pylint: disable=protected-access num_workers=num_workers, diff --git a/tensorflow/python/keras/distribute/distribute_strategy_test.py b/tensorflow/python/keras/distribute/distribute_strategy_test.py index 9592b299c87..8278b6bef02 100644 --- a/tensorflow/python/keras/distribute/distribute_strategy_test.py +++ b/tensorflow/python/keras/distribute/distribute_strategy_test.py @@ -1055,28 +1055,23 @@ class TestDistributionStrategyWithDatasets(test.TestCase, ], mode=['graph', 'eager'], run_distributed=[True, False])) - def test_dataset_no_batch_input_validation(self, distribution, - run_distributed, mode): - if mode == 'graph': - self.skipTest( - 'TODO(b/120943676, b/120957836): Re-enable for graph once the ' - 'validation code is restored.' 
- ) + def test_dataset_external_batch_input_validation(self, distribution, + run_distributed): with self.cached_session(): with distribution.scope(): + optimizer_fn = gradient_descent_keras.SGD + optimizer = optimizer_fn(learning_rate=0.001) model = get_model() - optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001) loss = 'mse' model.compile(optimizer, loss, run_distributed=run_distributed) - # User forgets to batch the dataset - inputs = np.zeros((10, 6), dtype=np.float32) - targets = np.zeros((10, 4), dtype=np.float32) + # Batching is done outside tf.data's `batch` + inputs = np.zeros((100, 10, 3), dtype=np.float32) + targets = np.zeros((100, 10, 4), dtype=np.float32) dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets)) dataset = dataset.repeat(100) - with self.assertRaisesRegexp(ValueError, 'Call.*batch.*on.*Dataset'): - model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0) + model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1) @combinations.generate( combinations.combine( diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt index c247479d35e..fac6284ec44 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt @@ -1254,7 +1254,7 @@ tf_module { } member_method { name: "ExperimentalRebatchDataset" - argspec: "args=[\'input_dataset\', \'num_workers\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'input_dataset\', \'num_workers\', \'output_types\', \'output_shapes\', \'use_fallback\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], " } member_method { name: "ExperimentalScanDataset" @@ -2978,7 +2978,7 @@ tf_module { } member_method { name: "RebatchDataset" - argspec: "args=[\'input_dataset\', \'num_workers\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'input_dataset\', \'num_workers\', \'output_types\', \'output_shapes\', \'use_fallback\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], " } member_method { name: "Reciprocal" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt index c247479d35e..fac6284ec44 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt @@ -1254,7 +1254,7 @@ tf_module { } member_method { name: "ExperimentalRebatchDataset" - argspec: "args=[\'input_dataset\', \'num_workers\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'input_dataset\', \'num_workers\', \'output_types\', \'output_shapes\', \'use_fallback\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], " } member_method { name: "ExperimentalScanDataset" @@ -2978,7 +2978,7 @@ tf_module { } member_method { name: "RebatchDataset" - argspec: "args=[\'input_dataset\', \'num_workers\', \'output_types\', \'output_shapes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + argspec: "args=[\'input_dataset\', \'num_workers\', \'output_types\', \'output_shapes\', \'use_fallback\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], " } member_method { name: "Reciprocal" From edd5e4285b3fcf7baaa095beb3c0c0955a2a61ef Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Mon, 22 Jul 2019 07:22:12 -0700 Subject: [PATCH 0292/3053] Update ops-related pbtxt files. PiperOrigin-RevId: 259322305 --- .../core/ops/compat/ops_history.v1.pbtxt | 53 +++++++++++++++++++ tensorflow/core/ops/ops.pbtxt | 53 +++++++++++++++++++ 2 files changed, 106 insertions(+) diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt index db25c1d0f6a..bdd96b5179e 100644 --- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt +++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt @@ -20094,6 +20094,59 @@ op { } } } +op { + name: "CumulativeLogsumexp" + input_arg { + name: "x" + type_attr: "T" + } + input_arg { + name: "axis" + type_attr: "Tidx" + } + output_arg { + name: "out" + type_attr: "T" + } + attr { + name: "exclusive" + type: "bool" + default_value { + b: false + } + } + attr { + name: "reverse" + type: "bool" + default_value { + b: false + } + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + } + } + } + attr { + name: "Tidx" + type: "type" + default_value { + type: DT_INT32 + } + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } +} op { name: "DataFormatDimMap" input_arg { diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index 30c638c9462..a29e37e01de 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -9498,6 +9498,59 @@ op { } } } +op { + name: "CumulativeLogsumexp" + input_arg { + name: "x" + type_attr: "T" + } + input_arg { + name: "axis" + type_attr: "Tidx" + } + output_arg { + name: "out" + type_attr: "T" + } + attr { + name: "exclusive" + type: "bool" + default_value { + b: false + } + } + attr { + name: "reverse" + type: "bool" + default_value { + b: false + } + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_HALF + type: DT_FLOAT + type: DT_DOUBLE + } + } + } + attr { + name: "Tidx" + type: "type" + default_value { + type: DT_INT32 + } + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } +} op { name: "DataFormatDimMap" input_arg { From 6ee9368fe66f86d89e96487bfad5a8d292d93231 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 22 Jul 2019 09:32:33 -0700 Subject: [PATCH 0293/3053] Update ops-related pbtxt files. 
PiperOrigin-RevId: 259344498 --- .../core/ops/compat/ops_history.v1.pbtxt | 68 +++++++++++++++++++ tensorflow/core/ops/ops.pbtxt | 14 ++++ 2 files changed, 82 insertions(+) diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt index bdd96b5179e..13a1cb8e3bf 100644 --- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt +++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt @@ -26449,6 +26449,40 @@ op { minimum: 1 } } +op { + name: "ExperimentalRebatchDataset" + input_arg { + name: "input_dataset" + type: DT_VARIANT + } + input_arg { + name: "num_workers" + type: DT_INT64 + } + output_arg { + name: "handle" + type: DT_VARIANT + } + attr { + name: "output_types" + type: "list(type)" + has_minimum: true + minimum: 1 + } + attr { + name: "output_shapes" + type: "list(shape)" + has_minimum: true + minimum: 1 + } + attr { + name: "use_fallback" + type: "bool" + default_value { + b: true + } + } +} op { name: "ExperimentalScanDataset" input_arg { @@ -57849,6 +57883,40 @@ op { minimum: 1 } } +op { + name: "RebatchDataset" + input_arg { + name: "input_dataset" + type: DT_VARIANT + } + input_arg { + name: "num_workers" + type: DT_INT64 + } + output_arg { + name: "handle" + type: DT_VARIANT + } + attr { + name: "output_types" + type: "list(type)" + has_minimum: true + minimum: 1 + } + attr { + name: "output_shapes" + type: "list(shape)" + has_minimum: true + minimum: 1 + } + attr { + name: "use_fallback" + type: "bool" + default_value { + b: true + } + } +} op { name: "Reciprocal" input_arg { diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index a29e37e01de..64bdb7c3253 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -13145,6 +13145,13 @@ op { has_minimum: true minimum: 1 } + attr { + name: "use_fallback" + type: "bool" + default_value { + b: true + } + } } op { name: "ExperimentalScanDataset" @@ -31084,6 +31091,13 @@ op { has_minimum: true minimum: 1 } + attr { + name: "use_fallback" + type: "bool" + default_value { + b: true + } + } } op { name: "Reciprocal" From fddb746b09f74c8dcf07a61de3e3f89292e89ed5 Mon Sep 17 00:00:00 2001 From: Ihor Indyk Date: Mon, 22 Jul 2019 09:47:32 -0700 Subject: [PATCH 0294/3053] [tf.data] Adding a benchmark to evaluate autotuning of a combination of `map_and_batch` and `interleave` transformations. 
PiperOrigin-RevId: 259347122 --- .../benchmarks/autotune_benchmark.py | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py b/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py index af7c4736083..a6ee0d7dec7 100644 --- a/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py +++ b/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py @@ -230,6 +230,71 @@ class AutotuneBenchmark(test.Benchmark): (("_autotune_%s" % algorithm.name) if autotune else "")) return np.median(deltas) + def benchmark_map_batch_and_interleave(self): + a = self._benchmark_map_batch_and_interleave(autotune=False) + b = self._benchmark_map_batch_and_interleave(autotune=True) + c = self._benchmark_map_batch_and_interleave( + autotune=True, algorithm=dataset_ops.AutotuneAlgorithm.GRADIENT_DESCENT) + print("HillClimb vs Default speedup: %f" % (a / b)) + print("GradientDescent vs Default speedup: %f" % (a / c)) + + def _benchmark_map_batch_and_interleave( + self, autotune, algorithm=dataset_ops.AutotuneAlgorithm.HILL_CLIMB): + batch_size = 16 + k = 1024 * 1024 + a = (np.random.rand(1, 8 * k), np.random.rand(8 * k, 1)) + b = (np.random.rand(1, 4 * k), np.random.rand(4 * k, 1)) + c = (np.random.rand(1, 2 * k), np.random.rand(2 * k, 1)) + dataset_a = dataset_ops.Dataset.from_tensors(a).repeat() + dataset_b = dataset_ops.Dataset.from_tensors(b).repeat() + dataset_c = dataset_ops.Dataset.from_tensors(c).repeat() + + dataset = dataset_a + dataset = dataset.map( + math_ops.matmul, num_parallel_calls=dataset_ops.AUTOTUNE) + dataset = dataset.batch(batch_size=batch_size) + dataset = dataset_ops.Dataset.range(1).repeat().interleave( + lambda _: dataset, + num_parallel_calls=dataset_ops.AUTOTUNE, + cycle_length=2) + + dataset = dataset_ops.Dataset.zip((dataset, dataset_b)) + dataset = dataset_ops.Dataset.range(1).repeat().interleave( + lambda _: dataset, + num_parallel_calls=dataset_ops.AUTOTUNE, + cycle_length=2) + + dataset_c = dataset_c.map( + math_ops.matmul, num_parallel_calls=dataset_ops.AUTOTUNE) + dataset_c = dataset_c.batch(batch_size=batch_size) + dataset = dataset_ops.Dataset.zip((dataset, dataset_c)) + options = dataset_ops.Options() + options.experimental_optimization.apply_default_optimizations = False + options.experimental_optimization.map_and_batch_fusion = True + options.experimental_optimization.autotune = autotune + if autotune: + options.experimental_optimization.autotune_algorithm = algorithm.value + dataset = dataset.with_options(options) + iterator = dataset_ops.make_one_shot_iterator(dataset) + get_next = iterator.get_next() + + deltas = [] + with session.Session() as sess: + for _ in range(5): + sess.run(get_next) + for _ in range(1000): + start = time.time() + sess.run(get_next) + end = time.time() + deltas.append(end - start) + + self.report_benchmark( + iters=1000, + wall_time=np.median(deltas), + name="map_batch_and_interleave" + + (("_autotune_%s" % algorithm.name) if autotune else "")) + return np.median(deltas) + if __name__ == "__main__": test.main() From 4b64dda76688a3a5b6de34126425138a3399d4d2 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Fri, 19 Jul 2019 22:18:38 -0700 Subject: [PATCH 0295/3053] Refactor AutoShardDatasetOp --- .../core/kernels/data/experimental/BUILD | 18 ++++ .../experimental/auto_shard_dataset_op.cc | 98 ++++++++++--------- .../data/experimental/auto_shard_dataset_op.h | 48 +++++++++ .../auto_shard_dataset_op_test.cc | 0 4 files changed, 116 
insertions(+), 48 deletions(-) create mode 100644 tensorflow/core/kernels/data/experimental/auto_shard_dataset_op.h create mode 100644 tensorflow/core/kernels/data/experimental/auto_shard_dataset_op_test.cc diff --git a/tensorflow/core/kernels/data/experimental/BUILD b/tensorflow/core/kernels/data/experimental/BUILD index 2ff370e92a6..e209cdc0b70 100644 --- a/tensorflow/core/kernels/data/experimental/BUILD +++ b/tensorflow/core/kernels/data/experimental/BUILD @@ -45,6 +45,7 @@ tf_cc_test( tf_kernel_library( name = "auto_shard_dataset_op", srcs = ["auto_shard_dataset_op.cc"], + hdrs = ["auto_shard_dataset_op.h"], deps = [ "//tensorflow/core:core_cpu_internal", "//tensorflow/core:dataset_ops_op_lib", @@ -57,6 +58,23 @@ tf_kernel_library( ], ) +tf_cc_test( + name = "auto_shard_dataset_op_test", + size = "small", + srcs = ["auto_shard_dataset_op_test.cc"], + deps = [ + ":auto_shard_dataset_op", + "//tensorflow/core:experimental_dataset_ops_op_lib", + "//tensorflow/core:framework", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + "//tensorflow/core/kernels/data:dataset_test_base", + "//tensorflow/core/kernels/data:shard_dataset_op", + "//third_party/eigen3", + ], +) + tf_kernel_library( name = "choose_fastest_branch_dataset_op", srcs = ["choose_fastest_branch_dataset_op.cc"], diff --git a/tensorflow/core/kernels/data/experimental/auto_shard_dataset_op.cc b/tensorflow/core/kernels/data/experimental/auto_shard_dataset_op.cc index 6ecea13ed76..79a830ac310 100644 --- a/tensorflow/core/kernels/data/experimental/auto_shard_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/auto_shard_dataset_op.cc @@ -12,74 +12,76 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/kernels/data/experimental/auto_shard_dataset_op.h" + #include "tensorflow/core/kernels/data/dataset_utils.h" #include "tensorflow/core/protobuf/rewriter_config.pb.h" namespace tensorflow { namespace data { -namespace { + +/* static */ constexpr const char* const AutoShardDatasetOp::kDatasetType; +/* static */ constexpr const char* const AutoShardDatasetOp::kInputDataset; +/* static */ constexpr const char* const AutoShardDatasetOp::kNumWorkers; +/* static */ constexpr const char* const AutoShardDatasetOp::kIndex; +/* static */ constexpr const char* const AutoShardDatasetOp::kOutputTypes; +/* static */ constexpr const char* const AutoShardDatasetOp::kOutputShapes; constexpr char kOptimizerName[] = "tf_auto_shard"; -class AutoShardDatasetOp : public UnaryDatasetOpKernel { - public: - explicit AutoShardDatasetOp(OpKernelConstruction* ctx) - : UnaryDatasetOpKernel(ctx) {} +AutoShardDatasetOp::AutoShardDatasetOp(OpKernelConstruction* ctx) + : UnaryDatasetOpKernel(ctx) {} - protected: - void MakeDataset(OpKernelContext* ctx, DatasetBase* input, - DatasetBase** output) override { - int64 index, num_workers; - OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_workers", &num_workers)); - OP_REQUIRES( - ctx, num_workers > 0, - errors::InvalidArgument("num_workers must be greater than zero.")); +void AutoShardDatasetOp::MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) { + int64 index, num_workers; + OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, kNumWorkers, &num_workers)); + OP_REQUIRES( + ctx, num_workers > 0, + errors::InvalidArgument("num_workers must be greater than zero.")); - OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "index", &index)); - OP_REQUIRES(ctx, index >= 0 && index < num_workers, - errors::InvalidArgument("index must be between 0 and ", - num_workers - 1)); + OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, kIndex, &index)); + OP_REQUIRES( + ctx, index >= 0 && index < num_workers, + errors::InvalidArgument("index must be between 0 and ", num_workers - 1)); - auto config_factory = [num_workers, index]() { - return CreateConfig(num_workers, index); - }; + auto config_factory = [num_workers, index]() { + return CreateConfig(num_workers, index); + }; - // We only want to optimize functions for some particular datasets like - // FlatMapDataset, InterleaveDataset etc. So we disable generalized - // function optimization and explicitly handle function modifications - // for those datasets in the rewrite. - OP_REQUIRES_OK(ctx, - RewriteDataset(ctx, input, std::move(config_factory), - /*optimize_function_library=*/false, output)); - } + // We only want to optimize functions for some particular datasets like + // FlatMapDataset, InterleaveDataset etc. So we disable generalized + // function optimization and explicitly handle function modifications + // for those datasets in the rewrite. 
+ OP_REQUIRES_OK(ctx, + RewriteDataset(ctx, input, std::move(config_factory), + /*optimize_function_library=*/false, output)); +} - private: - static RewriterConfig CreateConfig(int64 num_workers, int64 index) { - RewriterConfig rewriter_config; - rewriter_config.set_fail_on_optimizer_errors(true); - rewriter_config.add_optimizers(kOptimizerName); - rewriter_config.set_meta_optimizer_iterations(RewriterConfig::ONE); - auto custom_optimizer = rewriter_config.add_custom_optimizers(); - custom_optimizer->set_name(kOptimizerName); - AttrValue num_workers_attr; - num_workers_attr.set_i(num_workers); - (*custom_optimizer->mutable_parameter_map())["num_workers"] = - num_workers_attr; +RewriterConfig AutoShardDatasetOp::CreateConfig(int64 num_workers, + int64 index) { + RewriterConfig rewriter_config; + rewriter_config.set_fail_on_optimizer_errors(true); + rewriter_config.add_optimizers(kOptimizerName); + rewriter_config.set_meta_optimizer_iterations(RewriterConfig::ONE); + auto custom_optimizer = rewriter_config.add_custom_optimizers(); + custom_optimizer->set_name(kOptimizerName); + AttrValue num_workers_attr; + num_workers_attr.set_i(num_workers); + (*custom_optimizer->mutable_parameter_map())[kNumWorkers] = num_workers_attr; - AttrValue index_attr; - index_attr.set_i(index); - (*custom_optimizer->mutable_parameter_map())["index"] = index_attr; + AttrValue index_attr; + index_attr.set_i(index); + (*custom_optimizer->mutable_parameter_map())[kIndex] = index_attr; - return rewriter_config; - } -}; + return rewriter_config; +} +namespace { REGISTER_KERNEL_BUILDER(Name("AutoShardDataset").Device(DEVICE_CPU), AutoShardDatasetOp); REGISTER_KERNEL_BUILDER(Name("ExperimentalAutoShardDataset").Device(DEVICE_CPU), AutoShardDatasetOp); - } // anonymous namespace } // namespace data } // namespace tensorflow diff --git a/tensorflow/core/kernels/data/experimental/auto_shard_dataset_op.h b/tensorflow/core/kernels/data/experimental/auto_shard_dataset_op.h new file mode 100644 index 00000000000..73ab7ad6ab3 --- /dev/null +++ b/tensorflow/core/kernels/data/experimental/auto_shard_dataset_op.h @@ -0,0 +1,48 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_AUTO_SHARD_DATASET_OP_H_ +#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_AUTO_SHARD_DATASET_OP_H_ + +#include "tensorflow/core/framework/dataset.h" + +namespace tensorflow { +namespace data { + +// See documentation in ../../ops/experimental_dataset_ops.cc for a high-level +// description of the following op. 
+ +class AutoShardDatasetOp : public UnaryDatasetOpKernel { + public: + static constexpr const char* const kDatasetType = "AutoShard"; + static constexpr const char* const kInputDataset = "input_dataset"; + static constexpr const char* const kNumWorkers = "num_workers"; + static constexpr const char* const kIndex = "index"; + static constexpr const char* const kOutputTypes = "output_types"; + static constexpr const char* const kOutputShapes = "output_shapes"; + + explicit AutoShardDatasetOp(OpKernelConstruction* ctx); + + protected: + void MakeDataset(OpKernelContext* ctx, DatasetBase* input, + DatasetBase** output) override; + + private: + static RewriterConfig CreateConfig(int64 num_workers, int64 index); +}; + +} // namespace data +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_AUTO_SHARD_DATASET_OP_H_ diff --git a/tensorflow/core/kernels/data/experimental/auto_shard_dataset_op_test.cc b/tensorflow/core/kernels/data/experimental/auto_shard_dataset_op_test.cc new file mode 100644 index 00000000000..e69de29bb2d From 156c44db78bbad9d8f03a49e5b3666b3aa831641 Mon Sep 17 00:00:00 2001 From: Sergei Lebedev Date: Mon, 22 Jul 2019 09:49:07 -0700 Subject: [PATCH 0296/3053] Shuffled _EagerTensorBase methods so that magic methods are in a single block PiperOrigin-RevId: 259347415 --- tensorflow/python/framework/ops.py | 109 ++++++++++++++--------------- 1 file changed, 54 insertions(+), 55 deletions(-) diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index dbb61acbcfc..d19646fc69e 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -759,6 +759,60 @@ class Tensor(_TensorLike): class _EagerTensorBase(Tensor): """Base class for EagerTensor.""" + # __int__, __float__ and __index__ may copy the tensor to CPU and + # only work for scalars; values are cast as per numpy. + def __int__(self): + return int(self._numpy()) + + def __long__(self): + return long(self._numpy()) + + def __float__(self): + return float(self._numpy()) + + def __index__(self): + maybe_arr = self._numpy() + if isinstance(maybe_arr, np.ndarray): + return maybe_arr.__index__() + return int(maybe_arr) # Must be a NumPy scalar. + + def __bool__(self): + return bool(self._numpy()) + + __nonzero__ = __bool__ + + def __format__(self, format_spec): + return self._numpy().__format__(format_spec) + + def __reduce__(self): + return convert_to_tensor, (self._numpy(),) + + def __copy__(self): + # Eager Tensors are immutable so it's safe to return themselves as a copy. + return self + + def __deepcopy__(self, memo): + # Eager Tensors are immutable so it's safe to return themselves as a copy. 
+ del memo + return self + + def __str__(self): + return "tf.Tensor(%s, shape=%s, dtype=%s)" % (numpy_text(self), self.shape, + self.dtype.name) + + def __repr__(self): + return "" % ( + self._id, self.shape, self.dtype.name, numpy_text(self, is_repr=True)) + + def __len__(self): + """Returns the length of the first dimension in the Tensor.""" + if not self.shape.ndims: + raise TypeError("Scalar tensor has no `len()`") + return self._shape_tuple()[0] + + def _numpy(self): + raise NotImplementedError() + @property def dtype(self): # Note: using the intern table directly here as this is @@ -783,32 +837,6 @@ class _EagerTensorBase(Tensor): maybe_arr = self._numpy() # pylint: disable=protected-access return maybe_arr.copy() if isinstance(maybe_arr, np.ndarray) else maybe_arr - # __int__, __float__ and __index__ may copy the tensor to CPU and - # only work for scalars; values are cast as per numpy. - def __int__(self): - return int(self._numpy()) - - def __long__(self): - return long(self._numpy()) - - def __float__(self): - return float(self._numpy()) - - def __index__(self): - maybe_arr = self._numpy() - if isinstance(maybe_arr, np.ndarray): - return maybe_arr.__index__() - return int(maybe_arr) # Must be a NumPy scalar. - - def __format__(self, format_spec): - return self._numpy().__format__(format_spec) - - def __reduce__(self): - return (convert_to_tensor, (self._numpy(),)) - - def _numpy(self): - raise NotImplementedError() - @property def backing_device(self): """Returns the name of the device holding this tensor's memory. @@ -821,15 +849,6 @@ class _EagerTensorBase(Tensor): """ raise NotImplementedError() - def __copy__(self): - # Eager Tensors are immutable so it's safe to return themselves as a copy. - return self - - def __deepcopy__(self, memo): - # Eager Tensors are immutable so it's safe to return themselves as a copy. - del memo - return self - def _datatype_enum(self): raise NotImplementedError() @@ -876,14 +895,6 @@ class _EagerTensorBase(Tensor): def _copy_to_device(self, context, device): # pylint: disable=redefined-outer-name raise NotImplementedError() - def __str__(self): - return "tf.Tensor(%s, shape=%s, dtype=%s)" % (numpy_text(self), self.shape, - self.dtype.name) - - def __repr__(self): - return "" % ( - self._id, self.shape, self.dtype.name, numpy_text(self, is_repr=True)) - @staticmethod def _override_operator(name, func): setattr(_EagerTensorBase, name, func) @@ -942,12 +953,6 @@ class _EagerTensorBase(Tensor): """Returns the number of Tensor dimensions.""" return self.shape.ndims - def __len__(self): - """Returns the length of the first dimension in the Tensor.""" - if not self.shape.ndims: - raise TypeError("Scalar tensor has no `len()`") - return self._shape_tuple()[0] - @deprecation.deprecated(None, "Use tf.identity instead.") def cpu(self): """A copy of this Tensor with contents backed by host memory.""" @@ -967,12 +972,6 @@ class _EagerTensorBase(Tensor): """ return self._copy(context.context(), "GPU:" + str(gpu_index)) - def __bool__(self): - return bool(self._numpy()) - - def __nonzero__(self): - return self.__bool__() - def set_shape(self, shape): if not self.shape.is_compatible_with(shape): raise ValueError( From 0947898a14b96ce8e13d3c581ffb0d5af9608083 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 22 Jul 2019 09:54:55 -0700 Subject: [PATCH 0297/3053] [tf.data] Replacing `parallel_interleave` with `interleave` in the implementation of `TFRecordDataset`, `make_csv_dataset` and `make_batched_features_dataset`. 
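In user-level terms, the change amounts roughly to the following sketch (hand-written for illustration, not part of this patch; the file pattern and cycle length are made up):

import tensorflow as tf

filenames = tf.data.Dataset.list_files("/tmp/data/*.tfrecord")  # made-up pattern

# Before: the experimental transformation.
# dataset = filenames.apply(
#     tf.data.experimental.parallel_interleave(
#         tf.data.TFRecordDataset, cycle_length=4, sloppy=True))

# After: core interleave, with sloppiness expressed via Options.
dataset = filenames.interleave(
    tf.data.TFRecordDataset, cycle_length=4, num_parallel_calls=4)
options = tf.data.Options()
options.experimental_deterministic = False  # allow "sloppy" element order
dataset = dataset.with_options(options)

Setting experimental_deterministic to False is the core-API equivalent of the old sloppy=True argument.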
PiperOrigin-RevId: 259348564 --- .../data/experimental/ops/interleave_ops.py | 75 ++++++++++++++++-- .../python/data/experimental/ops/readers.py | 33 +++++--- tensorflow/python/data/ops/readers.py | 79 +++---------------- 3 files changed, 106 insertions(+), 81 deletions(-) diff --git a/tensorflow/python/data/experimental/ops/interleave_ops.py b/tensorflow/python/data/experimental/ops/interleave_ops.py index 9c9645c4947..9abf8fb8cb5 100644 --- a/tensorflow/python/data/experimental/ops/interleave_ops.py +++ b/tensorflow/python/data/experimental/ops/interleave_ops.py @@ -20,20 +20,84 @@ from __future__ import print_function from tensorflow.python.compat import compat from tensorflow.python.data.experimental.ops import random_ops from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.data.ops import readers +from tensorflow.python.data.util import convert from tensorflow.python.data.util import nest from tensorflow.python.data.util import structure from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_spec from tensorflow.python.ops import array_ops -from tensorflow.python.ops import gen_experimental_dataset_ops +from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops from tensorflow.python.ops import gen_stateless_random_ops from tensorflow.python.ops import math_ops from tensorflow.python.util import deprecation from tensorflow.python.util.tf_export import tf_export +class _ParallelInterleaveDataset(dataset_ops.UnaryDataset): + """A `Dataset` that maps a function over its input and flattens the result.""" + + def __init__(self, input_dataset, map_func, cycle_length, block_length, + sloppy, buffer_output_elements, prefetch_input_elements): + """See `tf.data.experimental.parallel_interleave()` for details.""" + self._input_dataset = input_dataset + self._map_func = dataset_ops.StructuredFunctionWrapper( + map_func, self._transformation_name(), dataset=input_dataset) + if not isinstance(self._map_func.output_structure, dataset_ops.DatasetSpec): + raise TypeError("`map_func` must return a `Dataset` object.") + self._element_spec = self._map_func.output_structure._element_spec # pylint: disable=protected-access + self._cycle_length = ops.convert_to_tensor( + cycle_length, dtype=dtypes.int64, name="cycle_length") + self._block_length = ops.convert_to_tensor( + block_length, dtype=dtypes.int64, name="block_length") + self._sloppy = ops.convert_to_tensor( + sloppy, dtype=dtypes.bool, name="sloppy") + self._buffer_output_elements = convert.optional_param_to_tensor( + "buffer_output_elements", + buffer_output_elements, + argument_default=2 * block_length) + self._prefetch_input_elements = convert.optional_param_to_tensor( + "prefetch_input_elements", + prefetch_input_elements, + argument_default=2 * cycle_length) + # pylint: disable=protected-access + if compat.forward_compatible(2019, 8, 3): + variant_tensor = ged_ops.parallel_interleave_dataset( + self._input_dataset._variant_tensor, + self._map_func.function.captured_inputs, + self._cycle_length, + self._block_length, + self._sloppy, + self._buffer_output_elements, + self._prefetch_input_elements, + f=self._map_func.function, + **self._flat_structure) + else: + variant_tensor = ged_ops.experimental_parallel_interleave_dataset( + self._input_dataset._variant_tensor, + self._map_func.function.captured_inputs, + self._cycle_length, + self._block_length, + self._sloppy, + self._buffer_output_elements, + 
self._prefetch_input_elements, + f=self._map_func.function, + **self._flat_structure) + # pylint: enable=protected-access + super(_ParallelInterleaveDataset, self).__init__(input_dataset, + variant_tensor) + + def _functions(self): + return [self._map_func] + + @property + def element_spec(self): + return self._element_spec + + def _transformation_name(self): + return "tf.data.experimental.parallel_interleave()" + + @deprecation.deprecated( None, "Use `tf.data.Dataset.interleave(map_func, cycle_length, block_length, " @@ -90,7 +154,7 @@ def parallel_interleave(map_func, `tf.data.Dataset.apply`. """ def _apply_fn(dataset): - return readers.ParallelInterleaveDataset( + return _ParallelInterleaveDataset( dataset, map_func, cycle_length, block_length, sloppy, buffer_output_elements, prefetch_input_elements) @@ -129,13 +193,13 @@ class _DirectedInterleaveDataset(dataset_ops.Dataset): # pylint: disable=protected-access if compat.forward_compatible(2019, 8, 3): return ( - gen_experimental_dataset_ops.directed_interleave_dataset( + ged_ops.directed_interleave_dataset( self._selector_input._variant_tensor, [data_input._variant_tensor for data_input in self._data_inputs], **self._flat_structure)) else: return ( - gen_experimental_dataset_ops.experimental_directed_interleave_dataset( + ged_ops.experimental_directed_interleave_dataset( self._selector_input._variant_tensor, [data_input._variant_tensor for data_input in self._data_inputs], **self._flat_structure)) @@ -294,3 +358,4 @@ choose_from_datasets_v1.__doc__ = choose_from_datasets_v2.__doc__ # these aliases in place. choose_from_datasets = choose_from_datasets_v1 sample_from_datasets = sample_from_datasets_v1 + diff --git a/tensorflow/python/data/experimental/ops/readers.py b/tensorflow/python/data/experimental/ops/readers.py index cf8b8c7a13e..91ebb5245a9 100644 --- a/tensorflow/python/data/experimental/ops/readers.py +++ b/tensorflow/python/data/experimental/ops/readers.py @@ -26,7 +26,6 @@ import numpy as np from tensorflow.python.compat import compat from tensorflow.python.data.experimental.ops import batching from tensorflow.python.data.experimental.ops import error_ops -from tensorflow.python.data.experimental.ops import interleave_ops from tensorflow.python.data.experimental.ops import parsing_ops from tensorflow.python.data.experimental.ops import shuffle_ops from tensorflow.python.data.ops import dataset_ops @@ -494,9 +493,18 @@ def make_csv_dataset_v2( return features # Read files sequentially (if num_parallel_reads=1) or in parallel - dataset = dataset.apply( - interleave_ops.parallel_interleave( - filename_to_dataset, cycle_length=num_parallel_reads, sloppy=sloppy)) + cycle_length = num_parallel_reads + if num_parallel_reads == dataset_ops.AUTOTUNE: + cycle_length = core_readers.DEFAULT_CYCLE_LENGTH + dataset = dataset.interleave( + filename_to_dataset, + cycle_length, + num_parallel_calls=num_parallel_reads) + + if sloppy: + options = dataset_ops.Options() + options.experimental_deterministic = False + dataset = dataset.with_options(options) dataset = _maybe_shuffle_and_repeat( dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed) @@ -838,11 +846,18 @@ def make_batched_features_dataset_v2(file_pattern, reader_args = [] # Read files sequentially (if reader_num_threads=1) or in parallel - dataset = dataset.apply( - interleave_ops.parallel_interleave( - lambda filename: reader(filename, *reader_args), - cycle_length=reader_num_threads, - sloppy=sloppy_ordering)) + cycle_length = reader_num_threads + if reader_num_threads == 
dataset_ops.AUTOTUNE: + cycle_length = core_readers.DEFAULT_CYCLE_LENGTH + dataset = dataset.interleave( + lambda filename: reader(filename, *reader_args), + cycle_length, + num_parallel_calls=reader_num_threads) + + if sloppy_ordering: + options = dataset_ops.Options() + options.experimental_deterministic = False + dataset = dataset.with_options(options) # Extract values if the `Example` tensors are stored as key-value tuples. if dataset_ops.get_legacy_output_types(dataset) == ( diff --git a/tensorflow/python/data/ops/readers.py b/tensorflow/python/data/ops/readers.py index a5610cdf7cd..5ece97fd0dd 100644 --- a/tensorflow/python/data/ops/readers.py +++ b/tensorflow/python/data/ops/readers.py @@ -26,13 +26,17 @@ from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_spec from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_dataset_ops -from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops from tensorflow.python.util.tf_export import tf_export # TODO(b/64974358): Increase default buffer size to 256 MB. _DEFAULT_READER_BUFFER_SIZE_BYTES = 256 * 1024 # 256 KB +# If the user requests the degree of interleave parallelism to be autotuned, +# cycle length controls the maximum level of parallelism. We set it to a small +# constant as a tradeoff between effective parallelism and memory and CPU usage. +DEFAULT_CYCLE_LENGTH = 10 + def _create_or_validate_filenames_dataset(filenames): """Creates (or validates) a dataset of filenames. @@ -80,10 +84,13 @@ def _create_dataset_reader(dataset_creator, filenames, num_parallel_reads=None): if num_parallel_reads is None: return filenames.flat_map(read_one_file) else: - return ParallelInterleaveDataset( - filenames, read_one_file, cycle_length=num_parallel_reads, - block_length=1, sloppy=False, buffer_output_elements=None, - prefetch_input_elements=None) + cycle_length = num_parallel_reads + if num_parallel_reads == dataset_ops.AUTOTUNE: + cycle_length = DEFAULT_CYCLE_LENGTH + return filenames.interleave( + read_one_file, + cycle_length, + num_parallel_calls=num_parallel_reads) class _TextLineDataset(dataset_ops.DatasetSource): @@ -213,68 +220,6 @@ class _TFRecordDataset(dataset_ops.DatasetSource): return tensor_spec.TensorSpec([], dtypes.string) -class ParallelInterleaveDataset(dataset_ops.UnaryDataset): - """A `Dataset` that maps a function over its input and flattens the result.""" - - def __init__(self, input_dataset, map_func, cycle_length, block_length, - sloppy, buffer_output_elements, prefetch_input_elements): - """See `tf.data.experimental.parallel_interleave()` for details.""" - self._input_dataset = input_dataset - self._map_func = dataset_ops.StructuredFunctionWrapper( - map_func, self._transformation_name(), dataset=input_dataset) - if not isinstance(self._map_func.output_structure, dataset_ops.DatasetSpec): - raise TypeError("`map_func` must return a `Dataset` object.") - self._element_spec = self._map_func.output_structure._element_spec # pylint: disable=protected-access - self._cycle_length = ops.convert_to_tensor( - cycle_length, dtype=dtypes.int64, name="cycle_length") - self._block_length = ops.convert_to_tensor( - block_length, dtype=dtypes.int64, name="block_length") - self._sloppy = ops.convert_to_tensor( - sloppy, dtype=dtypes.bool, name="sloppy") - self._buffer_output_elements = convert.optional_param_to_tensor( - "buffer_output_elements", - buffer_output_elements, - argument_default=2 * block_length) - self._prefetch_input_elements = 
convert.optional_param_to_tensor( - "prefetch_input_elements", - prefetch_input_elements, - argument_default=2 * cycle_length) - if compat.forward_compatible(2019, 8, 3): - variant_tensor = ged_ops.parallel_interleave_dataset( - self._input_dataset._variant_tensor, # pylint: disable=protected-access - self._map_func.function.captured_inputs, - self._cycle_length, - self._block_length, - self._sloppy, - self._buffer_output_elements, - self._prefetch_input_elements, - f=self._map_func.function, - **self._flat_structure) - else: - variant_tensor = ged_ops.experimental_parallel_interleave_dataset( - self._input_dataset._variant_tensor, # pylint: disable=protected-access - self._map_func.function.captured_inputs, - self._cycle_length, - self._block_length, - self._sloppy, - self._buffer_output_elements, - self._prefetch_input_elements, - f=self._map_func.function, - **self._flat_structure) - super(ParallelInterleaveDataset, self).__init__(input_dataset, - variant_tensor) - - def _functions(self): - return [self._map_func] - - @property - def element_spec(self): - return self._element_spec - - def _transformation_name(self): - return "tf.data.experimental.parallel_interleave()" - - @tf_export("data.TFRecordDataset", v1=[]) class TFRecordDatasetV2(dataset_ops.DatasetV2): """A `Dataset` comprising records from one or more TFRecord files.""" From cea5f5c65cc6905cb50bf17573b9e8681b460198 Mon Sep 17 00:00:00 2001 From: Bixia Zheng Date: Mon, 22 Jul 2019 09:57:42 -0700 Subject: [PATCH 0298/3053] [XLA:CPU] Fix EmitComplexAbs. Previously, when both the real and imaginary components are infinity, the CPU backend produces NAN. The change is to produce infinity. PiperOrigin-RevId: 259349116 --- .../xla/service/elemental_ir_emitter.cc | 31 ++++++++++--------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc index 48559bf5fc3..517d15f2c34 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc @@ -786,22 +786,25 @@ StatusOr ElementalIrEmitter::EmitFloatBinaryOp( // With the assumption that |a| >= |b| StatusOr ElementalIrEmitter::EmitComplexAbs( PrimitiveType prim_type, llvm::Value* operand_value) { - auto real = EmitExtractReal(operand_value); - auto imag = EmitExtractImag(operand_value); - auto abs_real = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, {real}, - {real->getType()}, b_); - auto abs_imag = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, {imag}, - {imag->getType()}, b_); - auto max = EmitFloatMax(abs_real, abs_imag); - auto min = EmitFloatMin(abs_real, abs_imag); + llvm::Value* real = EmitExtractReal(operand_value); + llvm::Value* imag = EmitExtractImag(operand_value); + llvm::Value* abs_real = llvm_ir::EmitCallToIntrinsic( + llvm::Intrinsic::fabs, {real}, {real->getType()}, b_); + llvm::Value* abs_imag = llvm_ir::EmitCallToIntrinsic( + llvm::Intrinsic::fabs, {imag}, {imag->getType()}, b_); + llvm::Value* max = EmitFloatMax(abs_real, abs_imag); + llvm::Value* min = EmitFloatMin(abs_real, abs_imag); - auto div = FDiv(min, max); - auto div_sq = FMul(div, div); - auto one = llvm::ConstantFP::get(max->getType(), 1); - TF_ASSIGN_OR_RETURN(auto sqrt, EmitSqrt(prim_type, FAdd(one, div_sq))); + llvm::Value* div = FDiv(min, max); + llvm::Value* div_sq = FMul(div, div); + llvm::Value* one = llvm::ConstantFP::get(max->getType(), 1); + TF_ASSIGN_OR_RETURN(llvm::Value * sqrt, + EmitSqrt(prim_type, 
FAdd(one, div_sq))); - auto zero = llvm::ConstantFP::get(max->getType(), 0); - return Select(FCmpOEQ(max, zero), zero, FMul(max, sqrt)); + llvm::Value* result = FMul(max, sqrt); + // When (min, max) are (0, 0), (inf, inf), or (NaN, ...), result is NaN. + // In such cases, we return min. + return Select(FCmpUNO(result, result), min, result); } // (a+bi)^(c+di) = From d0a243285d36526592b90f9ef1277c2f196bb6a8 Mon Sep 17 00:00:00 2001 From: captain-pool Date: Mon, 22 Jul 2019 23:05:11 +0530 Subject: [PATCH 0299/3053] Fixed Minor Bug --- tensorflow/python/tools/saved_model_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py index 62de9946de2..367670de411 100644 --- a/tensorflow/python/tools/saved_model_cli.py +++ b/tensorflow/python/tools/saved_model_cli.py @@ -193,7 +193,7 @@ def _show_defined_functions(saved_model_dir, indent=0): in_print(' Callable with:') _print_args(args, indent=3) if kwargs: - _print_args(args, "Named Argument", indent=3) + _print_args(kwargs, "Named Argument", indent=3) def _print_args(arguments, argument_type="Argument", indent=0): From 0c7e6972ce7ed604a9dfd2564d98ecb2d0a2dca9 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 22 Jul 2019 10:00:49 -0700 Subject: [PATCH 0300/3053] Remove unused proto imports. PiperOrigin-RevId: 259349684 --- tensorflow/core/protobuf/replay_log.proto | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/protobuf/replay_log.proto b/tensorflow/core/protobuf/replay_log.proto index 7644314fc9d..5506ec0c8ea 100644 --- a/tensorflow/core/protobuf/replay_log.proto +++ b/tensorflow/core/protobuf/replay_log.proto @@ -1,12 +1,11 @@ syntax = "proto3"; -option cc_enable_arenas = true; package tensorflow; -import "tensorflow/core/framework/graph.proto"; -import "tensorflow/core/protobuf/cluster.proto"; import "tensorflow/core/protobuf/master.proto"; +option cc_enable_arenas = true; + // Records the creation of a new replay session. We record the device listing // here to capture the state of the cluster. 
message NewReplaySession { From fd81388c12119a4835e397f971e9a16448869457 Mon Sep 17 00:00:00 2001 From: Kaixi Hou Date: Mon, 22 Jul 2019 10:45:13 -0700 Subject: [PATCH 0301/3053] Use ShouldUsePaddedIO() as a helper func --- tensorflow/core/kernels/cudnn_rnn_ops.cc | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/tensorflow/core/kernels/cudnn_rnn_ops.cc b/tensorflow/core/kernels/cudnn_rnn_ops.cc index 1daadd2f9f1..86ba2dbcabe 100644 --- a/tensorflow/core/kernels/cudnn_rnn_ops.cc +++ b/tensorflow/core/kernels/cudnn_rnn_ops.cc @@ -944,6 +944,18 @@ void RestoreParams(const OpInputList params_input, } } +bool ShouldUsePaddedIO(const Tensor* sequence_lengths, bool time_major) { + auto seq_array = sequence_lengths->template flat().data(); + bool all_max_seq_length = true; + for (int i = 0; i < model_shapes.batch_size; i++) { + if (seq_array[i] != model_shapes.max_seq_length) { + all_max_seq_length = false; + break; + } + } + return !(time_major && all_max_seq_length); +} + } // namespace // Note: all following kernels depend on a RnnDescriptor instance, which @@ -1862,15 +1874,7 @@ class CudnnRNNBackwardOp : public CudnnRNNKernelCommon { context, model_types(), time_major, &input, &input_h, &input_c, ¶ms, &sequence_lengths, num_proj, &model_shapes)); - auto seq_array = sequence_lengths->template flat().data(); - bool all_max_seq_length = true; - for (int i = 0; i < model_shapes.batch_size; i++) { - if (seq_array[i] != model_shapes.max_seq_length) { - all_max_seq_length = false; - break; - } - } - use_padded_io = !(time_major && all_max_seq_length); + use_padded_io = ShouldUsePaddedIO(sequence_lengths, time_major); } else { OP_REQUIRES_OK(context, ExtractForwardInput(context, model_types(), time_major, From f30e3c6efd7babf59ee84136d07d386d88b8b772 Mon Sep 17 00:00:00 2001 From: Bhavani Subramanian Date: Fri, 19 Jul 2019 13:53:00 -0700 Subject: [PATCH 0302/3053] Fixed build failure for v1.x --- third_party/mkl_dnn/mkldnn.BUILD | 2 -- 1 file changed, 2 deletions(-) diff --git a/third_party/mkl_dnn/mkldnn.BUILD b/third_party/mkl_dnn/mkldnn.BUILD index 6331a108e50..bbcb5bd14a1 100644 --- a/third_party/mkl_dnn/mkldnn.BUILD +++ b/third_party/mkl_dnn/mkldnn.BUILD @@ -62,8 +62,6 @@ cc_library( "src/cpu/xbyak/*.h", ]) + if_mkl_v1_open_source_only([ ":mkldnn_config_h", - "src/cpu/jit_utils/jit_utils.cpp", - "src/cpu/jit_utils/jit_utils.hpp", ]) + [":mkldnn_version_h"], hdrs = glob(["include/*"]), copts = [ From 3b8bc0a129e7a6e1a8aa08bb30901033ab9fda00 Mon Sep 17 00:00:00 2001 From: Bixia Zheng Date: Mon, 22 Jul 2019 10:28:48 -0700 Subject: [PATCH 0303/3053] [XLA] Replace +/-Inf with +/-Max when calculating absolute or relative errors. This results in more meaningful absolute or relative errors. PiperOrigin-RevId: 259355987 --- .../xla/tests/exhaustive_op_test_utils.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/xla/tests/exhaustive_op_test_utils.h b/tensorflow/compiler/xla/tests/exhaustive_op_test_utils.h index 956e1694fb7..ad42779ddc7 100644 --- a/tensorflow/compiler/xla/tests/exhaustive_op_test_utils.h +++ b/tensorflow/compiler/xla/tests/exhaustive_op_test_utils.h @@ -114,8 +114,12 @@ class ExhaustiveOpTestBase : public ClientLibraryTestBase { static_assert( std::is_same::value || std::is_same::value, "Only supports float and double."); - T abs_err = std::abs(expected - actual); - T rel_err = abs_err / std::abs(expected); + // Replace Inf with Max when calculating absolute or relative errors. 
This + // allows the test to pass when another value are close to Inf and the + // specified absolute or relative errors are not zero. + T abs_err = + std::abs(ReplaceInfWithMax(expected) - ReplaceInfWithMax(actual)); + T rel_err = abs_err / std::abs(ReplaceInfWithMax(expected)); if (spec.strict_signed_zeros && actual == T{0} && expected == T{0}) { // Check sign of zero. return std::signbit(actual) == std::signbit(expected); @@ -211,6 +215,16 @@ class ExhaustiveOpTestBase : public ClientLibraryTestBase { static std::vector> CreateExhaustiveF32Ranges(); + private: + template + T ReplaceInfWithMax(T value) { + if (std::isinf(value)) { + return std::copysign(std::numeric_limits::max(), value); + } + + return value; + } + protected: // The primitive type under test. const PrimitiveType ty_; From d3d4ee7fce00db001717c6ee560fe46b5b6ab618 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Fri, 19 Jul 2019 22:19:07 -0700 Subject: [PATCH 0304/3053] Add tests for AutoShardDatasetOp --- .../experimental/assert_next_dataset_op.cc | 1 - .../auto_shard_dataset_op_test.cc | 282 ++++++++++++++++++ 2 files changed, 282 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc b/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc index 592d8db8281..8171bb6ae75 100644 --- a/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc @@ -16,7 +16,6 @@ limitations under the License. #include -#include "tensorflow/core/framework/dataset.h" #include "tensorflow/core/framework/partial_tensor_shape.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/kernels/data/name_utils.h" diff --git a/tensorflow/core/kernels/data/experimental/auto_shard_dataset_op_test.cc b/tensorflow/core/kernels/data/experimental/auto_shard_dataset_op_test.cc index e69de29bb2d..33546416e56 100644 --- a/tensorflow/core/kernels/data/experimental/auto_shard_dataset_op_test.cc +++ b/tensorflow/core/kernels/data/experimental/auto_shard_dataset_op_test.cc @@ -0,0 +1,282 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/core/kernels/data/experimental/auto_shard_dataset_op.h" + +#include "tensorflow/core/kernels/data/dataset_test_base.h" +#include "tensorflow/core/kernels/data/shard_dataset_op.h" + +namespace tensorflow { +namespace data { +namespace { + +constexpr char kNodeName[] = "auto_shard_dataset"; +constexpr char kIteratorPrefix[] = "Iterator"; + +class AutoShardDatasetOpTest : public DatasetOpsTestBase { + protected: + // Creates a new `AutoShardDataset` op kernel. 
+ Status CreateAutoShardDatasetOpKernel( + const DataTypeVector& output_types, + const std::vector& output_shapes, + std::unique_ptr* op_kernel) { + NodeDef node_def = test::function::NDef( + kNodeName, name_utils::OpName(AutoShardDatasetOp::kDatasetType), + {AutoShardDatasetOp::kInputDataset, AutoShardDatasetOp::kNumWorkers, + AutoShardDatasetOp::kIndex}, + {{AutoShardDatasetOp::kOutputTypes, output_types}, + {AutoShardDatasetOp::kOutputShapes, output_shapes}}); + TF_RETURN_IF_ERROR(CreateOpKernel(node_def, op_kernel)); + return Status::OK(); + } + + // Create a new `AutoShardDataset` op kernel context + Status CreateAutoShardDatasetContext( + OpKernel* const op_kernel, + gtl::InlinedVector* const inputs, + std::unique_ptr* context) { + TF_RETURN_IF_ERROR(CheckOpKernelInput(*op_kernel, *inputs)); + TF_RETURN_IF_ERROR(CreateOpKernelContext(op_kernel, inputs, context)); + return Status::OK(); + } +}; + +struct RangeDatasetParams { + int64 start; + int64 stop; + int64 step; +}; + +struct TestCase { + RangeDatasetParams range_dataset_param; + Tensor num_workers; + Tensor index; + std::vector expected_outputs; + DataTypeVector expected_output_dtypes; + std::vector expected_output_shapes; + int64 expected_cardinality; + std::vector breakpoints; +}; + +// Test Case 1: simple case. +TestCase TestCase1() { + return {/*range_data_param*/ {0, 10, 1}, + /*num_workers*/ + DatasetOpsTestBase::CreateTensor(TensorShape({}), {5}), + /*index*/ + DatasetOpsTestBase::CreateTensor(TensorShape({}), {2}), + /*expected_outputs*/ + {DatasetOpsTestBase::CreateTensor(TensorShape({}), {2}), + DatasetOpsTestBase::CreateTensor(TensorShape({}), {7})}, + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({})}, + /*expected_cardinality*/ 2, + /*breakpoints*/ {0, 1, 5}}; +} + +// Test Case 2: the index is larger than the available elements. +TestCase TestCase2() { + return {/*range_data_param*/ {0, 1, 1}, + /*num_workers*/ + DatasetOpsTestBase::CreateTensor(TensorShape({}), {5}), + /*index*/ + DatasetOpsTestBase::CreateTensor(TensorShape({}), {2}), + /*expected_outputs*/ {}, + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({})}, + /*expected_cardinality*/ 0, + /*breakpoints*/ {0, 1}}; +} + +// Test Case 3: the number of outputs could not be evenly divided by +// num_workers. +TestCase TestCase3() { + return {/*range_data_param*/ {0, 10, 1}, + /*num_workers*/ + DatasetOpsTestBase::CreateTensor(TensorShape({}), {4}), + /*index*/ + DatasetOpsTestBase::CreateTensor(TensorShape({}), {3}), + /*expected_outputs*/ + {DatasetOpsTestBase::CreateTensor(TensorShape({}), {3}), + DatasetOpsTestBase::CreateTensor(TensorShape({}), {7})}, + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({})}, + /*expected_cardinality*/ 2, + /*breakpoints*/ {0, 1, 5}}; +} + +// TODO(feihugis): add more test cases that have ReaderDatasets (e.g. a +// CSVDataset or a TFRecordDataset) in the pipeline. 
+ +TestCase IndexGreaterNumWorkersCase() { + return {/*range_data_param*/ {0, 10, 1}, + /*num_workers*/ + DatasetOpsTestBase::CreateTensor(TensorShape({}), {5}), + /*index*/ + DatasetOpsTestBase::CreateTensor(TensorShape({}), {7}), + /*expected_outputs*/ {}, + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({})}, + /*expected_cardinality*/ 0, + /*breakpoints*/ {}}; +} + +TestCase NegativeIndexTestCase() { + return {/*range_data_param*/ {0, 10, 1}, + /*num_workers*/ + DatasetOpsTestBase::CreateTensor(TensorShape({}), {5}), + /*index*/ + DatasetOpsTestBase::CreateTensor(TensorShape({}), {-3}), + /*expected_outputs*/ {}, + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({})}, + /*expected_cardinality*/ 0, + /*breakpoints*/ {}}; +} + +TestCase NegativeNumWorkersTestCase() { + return {/*range_data_param*/ {0, 10, 1}, + /*num_workers*/ + DatasetOpsTestBase::CreateTensor(TensorShape({}), {-3}), + /*index*/ + DatasetOpsTestBase::CreateTensor(TensorShape({}), {1}), + /*expected_outputs*/ {}, + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({})}, + /*expected_cardinality*/ 0, + /*breakpoints*/ {}}; +} + +TestCase ZeroNumWorkersTestCase() { + return {/*range_data_param*/ {0, 10, 1}, + /*num_workers*/ + DatasetOpsTestBase::CreateTensor(TensorShape({}), {0}), + /*index*/ + DatasetOpsTestBase::CreateTensor(TensorShape({}), {1}), + /*expected_outputs*/ {}, + /*expected_output_dtypes*/ {DT_INT64}, + /*expected_output_shapes*/ {PartialTensorShape({})}, + /*expected_cardinality*/ 0, + /*breakpoints*/ {}}; +} + +class ParameterizedAutoShardDatasetOpTest + : public AutoShardDatasetOpTest, + public ::testing::WithParamInterface {}; + +TEST_P(ParameterizedAutoShardDatasetOpTest, GetNext) { + int thread_num = 2, cpu_num = 2; + TestCase test_case = GetParam(); + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + + std::unique_ptr auto_shard_dataset_kernel; + TF_ASSERT_OK(CreateAutoShardDatasetOpKernel(test_case.expected_output_dtypes, + test_case.expected_output_shapes, + &auto_shard_dataset_kernel)); + + Tensor start = CreateTensor(TensorShape({}), + {test_case.range_dataset_param.start}); + Tensor stop = CreateTensor(TensorShape({}), + {test_case.range_dataset_param.stop}); + Tensor step = CreateTensor(TensorShape({}), + {test_case.range_dataset_param.step}); + Tensor range_dataset_tensor(DT_VARIANT, TensorShape({})); + TF_ASSERT_OK(MakeRangeDataset(start, stop, step, {DT_INT64}, + {TensorShape({})}, &range_dataset_tensor)); + + Tensor num_workers = test_case.num_workers; + Tensor index = test_case.index; + gtl::InlinedVector inputs({TensorValue(&range_dataset_tensor), + TensorValue(&num_workers), + TensorValue(&index)}); + std::unique_ptr auto_shard_dataset_context; + TF_ASSERT_OK(CreateAutoShardDatasetContext( + auto_shard_dataset_kernel.get(), &inputs, &auto_shard_dataset_context)); + + DatasetBase* auto_shard_dataset; + TF_ASSERT_OK(CreateDataset(auto_shard_dataset_kernel.get(), + auto_shard_dataset_context.get(), + &auto_shard_dataset)); + core::ScopedUnref scoped_unref_auto_shard_dataset(auto_shard_dataset); + + std::unique_ptr iterator_ctx; + TF_ASSERT_OK( + CreateIteratorContext(auto_shard_dataset_context.get(), &iterator_ctx)); + std::unique_ptr iterator; + TF_ASSERT_OK(auto_shard_dataset->MakeIterator(iterator_ctx.get(), + kIteratorPrefix, &iterator)); + + bool end_of_sequence = false; + auto expected_outputs_it = 
test_case.expected_outputs.begin(); + std::vector out_tensors; + while (!end_of_sequence) { + TF_EXPECT_OK( + iterator->GetNext(iterator_ctx.get(), &out_tensors, &end_of_sequence)); + if (!end_of_sequence) { + EXPECT_LT(expected_outputs_it, test_case.expected_outputs.end()); + TF_EXPECT_OK(ExpectEqual(out_tensors.back(), *expected_outputs_it)); + expected_outputs_it++; + } + } + EXPECT_EQ(expected_outputs_it, test_case.expected_outputs.end()); +} + +INSTANTIATE_TEST_SUITE_P(AutoShardDatasetOpTest, + ParameterizedAutoShardDatasetOpTest, + ::testing::ValuesIn(std::vector( + {TestCase1(), TestCase2(), TestCase3()}))); + +TEST_F(AutoShardDatasetOpTest, InvalidArguments) { + int thread_num = 2, cpu_num = 2; + TF_ASSERT_OK(InitThreadPool(thread_num)); + TF_ASSERT_OK(InitFunctionLibraryRuntime({}, cpu_num)); + + std::vector test_cases = { + IndexGreaterNumWorkersCase(), NegativeIndexTestCase(), + NegativeNumWorkersTestCase(), ZeroNumWorkersTestCase()}; + for (const auto& test_case : test_cases) { + std::unique_ptr auto_shard_dataset_kernel; + TF_ASSERT_OK(CreateAutoShardDatasetOpKernel( + test_case.expected_output_dtypes, test_case.expected_output_shapes, + &auto_shard_dataset_kernel)); + + Tensor start = CreateTensor(TensorShape({}), + {test_case.range_dataset_param.start}); + Tensor stop = CreateTensor(TensorShape({}), + {test_case.range_dataset_param.stop}); + Tensor step = CreateTensor(TensorShape({}), + {test_case.range_dataset_param.step}); + Tensor range_dataset_tensor(DT_VARIANT, TensorShape({})); + TF_ASSERT_OK(MakeRangeDataset(start, stop, step, {DT_INT64}, + {TensorShape({})}, &range_dataset_tensor)); + + Tensor num_workers = test_case.num_workers; + Tensor index = test_case.index; + gtl::InlinedVector inputs( + {TensorValue(&range_dataset_tensor), TensorValue(&num_workers), + TensorValue(&index)}); + std::unique_ptr auto_shard_dataset_context; + TF_ASSERT_OK(CreateAutoShardDatasetContext( + auto_shard_dataset_kernel.get(), &inputs, &auto_shard_dataset_context)); + + DatasetBase* auto_shard_dataset; + EXPECT_EQ( + CreateDataset(auto_shard_dataset_kernel.get(), + auto_shard_dataset_context.get(), &auto_shard_dataset) + .code(), + tensorflow::error::INVALID_ARGUMENT); + } +} + +} // namespace +} // namespace data +} // namespace tensorflow From 0ac335956ce5ab4cf1cc3a170d2decb6d601cd6a Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Mon, 22 Jul 2019 17:38:56 +0000 Subject: [PATCH 0305/3053] Fix failing test in python 3 where by default byte (instead of string) is used Signed-off-by: Yong Tang --- tensorflow/python/data/experimental/ops/readers.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/data/experimental/ops/readers.py b/tensorflow/python/data/experimental/ops/readers.py index 6a496ba357a..ae20b5e1cd7 100644 --- a/tensorflow/python/data/experimental/ops/readers.py +++ b/tensorflow/python/data/experimental/ops/readers.py @@ -431,6 +431,12 @@ def make_csv_dataset_v2( dataset = dataset.shuffle(len(filenames), shuffle_seed) # Clean arguments; figure out column names and defaults + def gzip_file_io_open(filename, mode): + # By default, gzip will open in byte mode which will + # not work with csv.reader so we create a wrapper to + # append `t`. 
+ mode = mode + "t" if "t" not in mode else mode + return gzip.open(filename, mode) if column_names is None or column_defaults is None: # Find out which io function to open the file file_io_fn = file_io.FileIO @@ -439,7 +445,7 @@ def make_csv_dataset_v2( if compression_type_value is None: raise ValueError("Received unkown compression_type") if compression_type_value == "GZIP": - file_io_fn = gzip.GzipFile + file_io_fn = gzip_file_io_open elif compression_type_value == "ZLIB": raise ValueError( "compression_type (%s) is not supported for probing columns" % From c56113eb3bae2f3adc1b3cba466d1b2884b27b87 Mon Sep 17 00:00:00 2001 From: Kaixi Hou Date: Mon, 22 Jul 2019 11:09:51 -0700 Subject: [PATCH 0306/3053] minor changes --- tensorflow/core/kernels/cudnn_rnn_ops.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/cudnn_rnn_ops.cc b/tensorflow/core/kernels/cudnn_rnn_ops.cc index 86ba2dbcabe..bd282d815bf 100644 --- a/tensorflow/core/kernels/cudnn_rnn_ops.cc +++ b/tensorflow/core/kernels/cudnn_rnn_ops.cc @@ -944,7 +944,9 @@ void RestoreParams(const OpInputList params_input, } } -bool ShouldUsePaddedIO(const Tensor* sequence_lengths, bool time_major) { +bool ShouldUsePaddedIO(const Tensor* sequence_lengths, + const CudnnRnnModelShapes& model_shapes, + bool time_major) { auto seq_array = sequence_lengths->template flat().data(); bool all_max_seq_length = true; for (int i = 0; i < model_shapes.batch_size; i++) { @@ -1874,7 +1876,8 @@ class CudnnRNNBackwardOp : public CudnnRNNKernelCommon { context, model_types(), time_major, &input, &input_h, &input_c, &params, &sequence_lengths, num_proj, &model_shapes)); - use_padded_io = ShouldUsePaddedIO(sequence_lengths, time_major); + use_padded_io = ShouldUsePaddedIO(sequence_lengths, model_shapes, + time_major); } else { OP_REQUIRES_OK(context, ExtractForwardInput(context, model_types(), time_major, From b3ccc749b87a7a6c298e10f66e25f6894c63e87d Mon Sep 17 00:00:00 2001 From: Andrew Lihonosov Date: Mon, 22 Jul 2019 21:20:11 +0300 Subject: [PATCH 0307/3053] Fix large (>4GB) files reading on windows There is a bug which prevents reading files larger than 4GB on Windows. TF uses the ::ReadFile WinAPI function (see pread in windows_file_system.cc). This function accepts the requested number of bytes as a DWORD, which is 32-bit on both 32-bit and 64-bit systems. But WindowsRandomAccessFile::Read passes the number of bytes as size_t, which is 64-bit on 64-bit systems. Then there is a static_cast from the 64-bit size_t to the 32-bit DWORD, which causes the error. Changed to read such files in portions of no more than std::numeric_limits::max() bytes.
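For illustration only, a minimal, platform-neutral sketch of the narrowing cast described above, with std::uint32_t standing in for the Win32 DWORD that ::ReadFile accepts (this is not the TensorFlow code):

#include <cstdint>
#include <cstdio>

int main() {
  // A 5 GB read request, as WindowsRandomAccessFile::Read might receive it.
  std::size_t n = 5ULL * 1024 * 1024 * 1024;
  // The narrowing cast silently wraps modulo 2^32: 5 GB becomes 1 GB.
  std::uint32_t dword_n = static_cast<std::uint32_t>(n);
  std::printf("requested=%llu bytes, passed on as %u bytes\n",
              static_cast<unsigned long long>(n),
              static_cast<unsigned>(dword_n));
  return 0;
}

The actual fix in the diff below instead caps each individual request at the DWORD maximum and loops until all bytes are read.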
--- tensorflow/core/platform/windows/windows_file_system.cc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/platform/windows/windows_file_system.cc b/tensorflow/core/platform/windows/windows_file_system.cc index 8580c3a3efb..14543c29f52 100644 --- a/tensorflow/core/platform/windows/windows_file_system.cc +++ b/tensorflow/core/platform/windows/windows_file_system.cc @@ -122,7 +122,13 @@ class WindowsRandomAccessFile : public RandomAccessFile { Status s; char* dst = scratch; while (n > 0 && s.ok()) { - SSIZE_T r = pread(hfile_, dst, n, offset); + size_t requested_read_length; + if (n > std::numeric_limits::max()) { + requested_read_length = std::numeric_limits::max(); + } else { + requested_read_length = n; + } + SSIZE_T r = pread(hfile_, dst, requested_read_length, offset); if (r > 0) { offset += r; dst += r; From 30f6e97551a58ede41205bd96c18424ec78b9354 Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Mon, 22 Jul 2019 10:29:00 -0700 Subject: [PATCH 0308/3053] [XLA] Clarify padding semantics for reduce window PiperOrigin-RevId: 259356026 --- .../compiler/xla/g3doc/operation_semantics.md | 19 +++++++++++++++++-- .../compiler/xla/tests/reduce_window_test.cc | 9 +++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/xla/g3doc/operation_semantics.md b/tensorflow/compiler/xla/g3doc/operation_semantics.md index d6c99580c39..7bf48d53f70 100644 --- a/tensorflow/compiler/xla/g3doc/operation_semantics.md +++ b/tensorflow/compiler/xla/g3doc/operation_semantics.md @@ -1980,8 +1980,12 @@ window_strides, padding)` | `window_dilations` | `ArraySlice` | array of integers for window | : : : dilation values : | `padding` | `Padding` | padding type for window | -: : : (Padding\:\:kSame or : -: : : Padding\:\:kValid) : +: : : (Padding\:\:kSame, which pads so : +: : : as to have the same output shape : +: : : as input if the stride is 1, or : +: : : Padding\:\:kValid, which uses no : +: : : no padding and "stops" the : +: : : window once it no longer fits) : Below code and figure shows an example of using `ReduceWindow`. Input is a matrix of size [4x6] and both window_dimensions and window_stride_dimensions are @@ -2027,6 +2031,17 @@ padding. +For a non-trivial padding example, consider computing reduce-window minimum +(initial value is `MAX_FLOAT`) with dimension `3` and stride `2` over the input +array `[10000, 1000, 100, 10, 1]`. Padding `kValid` computes minimums over two +_valid_ windows: `[10000, 1000, 100]` and `[100, 10, 1]`, resulting in the +output `[100, 1]`. Padding `kSame` first pads the array so that the shape after +the reduce-window would be the _same_ as input for stride one by adding initial +elements on both sides, getting `[MAX_VALUE, 10000, 1000, 100, 10, 1, +MAX_VALUE]`. Running reduce-window over the padded array operates on three +windows `[MAX_VALUE, 10000, 1000]`, `[1000, 100, 10]`, `[10, 1, MAX_VALUE]`, and +yields `[1000, 10, 1]`. + The evaluation order of the reduction function is arbitrary and may be non-deterministic. Therefore, the reduction function should not be overly sensitive to reassociation. 
See the discussion about associativity in the diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc index c5e1dbe7432..ff8adb0c460 100644 --- a/tensorflow/compiler/xla/tests/reduce_window_test.cc +++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc @@ -142,6 +142,15 @@ XLA_TEST_P(ReduceWindowTest, Min3In5Stride2) { {}, ErrorSpec(0.00001)); } +XLA_TEST_P(ReduceWindowTest, Min3In5Stride2Same) { + const auto input = CreateConstantFromLiteral( + LiteralUtil::CreateR1({10000, 1000, 100, 10, 1}), &builder_); + ReduceWindowMin(input, {3}, {2}, Padding::kSame); + ComputeAndCompareLiteral(&builder_, + LiteralUtil::CreateR1({1000, 10, 1}), {}, + ErrorSpec(0.00001)); +} + XLA_TEST_P(ReduceWindowTest, Min3In5Stride1WithSamePadding) { const auto input = CreateConstantFromLiteral( LiteralUtil::CreateR1({10000, 1000, 100, 10, 1}), &builder_); From a96ca65de07052dc60f48cb79151a0ee806f76b4 Mon Sep 17 00:00:00 2001 From: Kaixi Hou Date: Mon, 22 Jul 2019 11:37:06 -0700 Subject: [PATCH 0309/3053] minor changes --- tensorflow/core/kernels/cudnn_rnn_ops.cc | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/tensorflow/core/kernels/cudnn_rnn_ops.cc b/tensorflow/core/kernels/cudnn_rnn_ops.cc index bd282d815bf..55e8bc134bc 100644 --- a/tensorflow/core/kernels/cudnn_rnn_ops.cc +++ b/tensorflow/core/kernels/cudnn_rnn_ops.cc @@ -1468,15 +1468,8 @@ class CudnnRNNForwardOp : public CudnnRNNKernelCommon { context, model_types(), time_major, &input, &input_h, &input_c, ¶ms, &sequence_lengths, num_proj, &model_shapes)); - auto seq_array = sequence_lengths->template flat().data(); - bool all_max_seq_length = true; - for (int i = 0; i < model_shapes.batch_size; i++) { - if (seq_array[i] != model_shapes.max_seq_length) { - all_max_seq_length = false; - break; - } - } - use_padded_io = !(time_major && all_max_seq_length); + use_padded_io = ShouldUsePaddedIO(sequence_lengths, model_shapes, + time_major); } else { OP_REQUIRES_OK(context, ExtractForwardInput(context, model_types(), time_major, From 07b46f87b38378de414f04a73bdc606f8f3a5967 Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Mon, 22 Jul 2019 10:30:04 -0700 Subject: [PATCH 0310/3053] [XLA] Add test for BackendConfigs which contain nan/inf. PiperOrigin-RevId: 259356278 --- tensorflow/compiler/xla/service/BUILD | 2 ++ .../xla/service/hlo_instruction_test.cc | 23 +++++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index ce4c501ff07..f34572bd2a4 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -565,8 +565,10 @@ tf_cc_test( "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:window_util", + "//tensorflow/compiler/xla/service/gpu:backend_configs", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:test", "@com_google_absl//absl/container:flat_hash_map", ], ) diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc index 80de1d5e0bc..f06a7720dbc 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc @@ -24,6 +24,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/protobuf_util.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h" #include "tensorflow/compiler/xla/service/hlo_casting_utils.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instructions.h" @@ -34,6 +35,7 @@ limitations under the License. #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/window_util.h" +#include "tensorflow/core/lib/core/status_test_util.h" namespace xla { namespace { @@ -1956,5 +1958,26 @@ TEST_F(HloInstructionTest, GatherDoesNotReuseElements) { EXPECT_FALSE(root->ReusesOperandElements(1)); } +TEST_F(HloInstructionTest, BackendConfigCanContainNonFiniteFloats) { + HloComputation::Builder b(TestName()); + Shape shape = ShapeUtil::MakeShape(F32, {2, 2}); + auto p0 = b.AddInstruction(HloInstruction::CreateParameter(0, shape, "p0")); + DotDimensionNumbers dot_dnums; + dot_dnums.add_lhs_contracting_dimensions(1); + dot_dnums.add_rhs_contracting_dimensions(0); + auto dot = b.AddInstruction(HloInstruction::CreateDot( + shape, p0, p0, dot_dnums, DefaultPrecisionConfig(2))); + + gpu::GemmBackendConfig orig_config; + orig_config.set_alpha_real(std::numeric_limits::infinity()); + orig_config.set_alpha_imag(std::numeric_limits::quiet_NaN()); + TF_ASSERT_OK(dot->set_backend_config(orig_config)); + + TF_ASSERT_OK_AND_ASSIGN(auto new_config, + dot->backend_config()); + EXPECT_GT(new_config.alpha_real(), std::numeric_limits::max()); + EXPECT_NE(new_config.alpha_imag(), new_config.alpha_imag()); +} + } // namespace } // namespace xla From 507b688b9c19fac4bf849c13f46c13236202c210 Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Mon, 22 Jul 2019 10:33:46 -0700 Subject: [PATCH 0311/3053] Print none instead of invoking UB in cuda_conv_runner PiperOrigin-RevId: 259357156 --- tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc index c2817e36466..5aa76ac0140 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc @@ -158,9 +158,11 @@ Status RunCudnnConvImpl(const CudnnConvParams& params, if (!stream->ok()) { return InternalError( - "Unable to launch convolution with type %s and algorithm (%d, %d)", + "Unable to launch convolution with type %s and algorithm (%d, %s)", CudnnConvKindToString(params.kind), algorithm.algorithm()->algo_id(), - algorithm.algorithm_no_scratch()->algo_id()); + algorithm.algorithm_no_scratch().has_value() + ? 
absl::StrCat(algorithm.algorithm_no_scratch()->algo_id()) + : "none"); } return Status::OK(); } From 808a8068ad9a206d979d34b33357dd92f21ba786 Mon Sep 17 00:00:00 2001 From: amoitra Date: Mon, 22 Jul 2019 11:51:18 -0700 Subject: [PATCH 0312/3053] Incorporate Adrian's comments --- tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) mode change 100644 => 100755 tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc old mode 100644 new mode 100755 index a441e70510a..9c859a00dbc --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc @@ -155,8 +155,8 @@ MatchBackwardFilter(HloInstruction* conv) { } auto rhs_in = conv->mutable_operand(1)->shape().dimensions(kernel_input_feature_dim); - if ((conv->feature_group_count() > 1) && (rhs_in == 1) && - (input_batch_dim == output_batch_dim)) { + if (conv->feature_group_count() > 1 && rhs_in == 1 && + input_batch_dim == output_batch_dim) { VLOG(1) << conv->ToString() << " is a depthwise forward convolution. No need to fold to " "backward filter."; @@ -270,8 +270,7 @@ MatchBackwardFilter(HloInstruction* conv) { // Reshape batch_dim G*N -> [G,N] std::vector reshape_dims = lhs->shape().dimensions(); auto num_groups = conv->feature_group_count(); - // Ensure that input_batch is exact multiple of conv->feature_group_count() - CHECK_EQ(input_batch % conv->feature_group_count(), 0) + CHECK_EQ(input_batch % num_groups, 0) << "Input batch should be an exact multiple of feature group count"; reshape_dims[input_batch_dimension] = reshape_dims[input_batch_dimension] / num_groups; From 969a4b05b4b7bbda14c4b4b44a94137220340bb7 Mon Sep 17 00:00:00 2001 From: amoitra Date: Mon, 22 Jul 2019 11:58:33 -0700 Subject: [PATCH 0313/3053] minor edit --- tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc index 9c859a00dbc..33486608c1c 100755 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc @@ -328,7 +328,7 @@ MatchBackwardInput(HloInstruction* conv) { if (conv->feature_group_count() > 1) { return no_match_result; } - + // Match instruction pattern. CHECK_EQ(HloOpcode::kConvolution, conv->opcode()); HloInstruction* reverse_filter = conv->mutable_operand(1); From d73da78c4cbb6cce4378df9d52c5104d5e7c38b6 Mon Sep 17 00:00:00 2001 From: Justin Lebar Date: Mon, 22 Jul 2019 10:57:34 -0700 Subject: [PATCH 0314/3053] [XLA:GPU] Simplify the calling convention for custom-calls with tuple inputs/outputs. Previously it was up to the implementation to walk the input tuples and set the output tuple. Now this is handled by XLA, and you as the custom-call implementer just need to worry about the tuple leaf nodes. IOW this implements implicit tuple flattening for custom-calls. Note this is a breaking API/ABI change for people who use GPU custom-calls. We did warn you. 
:) PiperOrigin-RevId: 259362416 --- tensorflow/compiler/xla/g3doc/custom_call.md | 139 ++---------------- .../xla/service/gpu/custom_call_test.cc | 110 +++++++------- .../xla/service/gpu/custom_call_thunk.cc | 118 +++++++++++++-- tensorflow/compiler/xla/shape_util.h | 11 ++ 4 files changed, 182 insertions(+), 196 deletions(-) diff --git a/tensorflow/compiler/xla/g3doc/custom_call.md b/tensorflow/compiler/xla/g3doc/custom_call.md index acc2c9a92f5..7837f0aefaf 100644 --- a/tensorflow/compiler/xla/g3doc/custom_call.md +++ b/tensorflow/compiler/xla/g3doc/custom_call.md @@ -128,8 +128,8 @@ using xla::ShapeUtil; Shape p0_shape = ShapeUtil::MakeTuple({ ShapeUtil::MakeShape(F32, {32}), ShapeUtil::MakeTuple({ - ShapeUtil::MakeTuple(F32, {64}), - ShapeUtil::MakeTuple(F32, {128}), + ShapeUtil::MakeShape(F32, {64}), + ShapeUtil::MakeShape(F32, {128}), }), ShapeUtil::MakeShape(F32, {256}), }); @@ -197,133 +197,18 @@ subbuffers of `output_tuple` are accessible by dereferencing `out`. ### Tuples in GPU custom-calls In GPU code, we have a function `do_custom_call(..., void** buffers, ...)`. In -this case `buffers` is a host array of *nine* device pointers, one for each -nested buffer. To generate the flat list, we iterate over the parameters and -output, and then do preorder traversal of their shapes. Concretely: +this case `buffers` is a host array of *six* device pointers, one for each leaf +buffer in the input/output. To generate the flat list, we iterate over the +parameters and output, and for each we do a preorder traversal of its shape. +Concretely: ```c++ // Layout of `buffers` parameter to GPU custom call function for custom-call // above. -buffers[0] == param0 -buffers[1] == subbuf0 or null -buffers[2] == subtuple or null -buffers[3] == subbuf1 or null -buffers[4] == subbuf2 or null -buffers[5] == subbuf3 or null -buffers[6] == output_tuple -buffers[7] == output_subbuf0 -buffers[8] == output_subbuf1 +buffers[0] == subbuf0 +buffers[1] == subbuf1 +buffers[2] == subbuf2 +buffers[3] == subbuf3 +buffers[4] == output_subbuf0 +buffers[5] == output_subbuf1 ``` - -The `or null` part is significant. A sub-buffer of an input tuple will be -non-null in the `buffers` list if XLA is able to statically analyze the program -and figure out the address of the sub-buffer. This is usually the case, but may -not be in programs with control flow and/or `select` ops over tuples. - -A correct custom-call implementation that accepts a tuple as input must always -handle null input sub-buffers, by dereferencing the root tuple. - -The rule is reversed for output buffers. The output sub-buffers will always be -populated, but it's up to the custom call to populate the root tuple at the end. - -See the following code. Note that we leave out CUDA error handling for clarity, -but you'll be thankful if you do it, because otherwise it can be hard to tell -when a stream encounters an error. - -```c++ -void do_custom_call(CUstream stream, void** buffers, const char* opaque, - size_t opaque_len) { - bool needs_sync = false; - const float* subbuf0 = reinterpret_cast(buffers[1]); - if (subbuf0 == nullptr) { - needs_sync = true; - cudaMemcpyAsync(&subbuf0, buffers[0], sizeof(void*), - cudaMemcpyDeviceToHost, stream); - } - const void** subtuple = reinterpret_cast(buffers[2]); - if (subtuple == nullptr) { - needs_sync = true; - cudaMemcpyAsync(&subtuple, buffers[2], ...); - } - - // ... similarly for other params ... - - // Wait for copies enqueued above to complete. 
- if (needs_sync) { - cudaStreamSynchronize(stream); - } - needs_sync = false; - - // Now that we have `subtuple`, we can get subbuf1 and subbuf2. - float* subbuf1 = buffers[3]; - if (subbuf1 == nullptr) { - needs_sync = true; - cudaMemcpyAsync(&subbuf1, subtuple, ...); - } - float* subbuf2 = buffers[4]; - if (subbuf2 == nullptr) { - needs_sync = true; - cudaMemcpyAsync(&subbuf2, subtuple + 1, ...); - } - - // Wait for copies enqueued above to complete. - if (needs_sync) { - cudaStreamSynchronize(stream); - } - - // ... actually run the kernel ... - - // Fill the output tuple. - void* outputs[2] = {buffers[7], buffers[8]}; - cudaMemcpyAsync(buffers[6], outputs, sizeof(outputs), cudaMemcpyHostToDevice, - stream); - - // Necessary to force the cudaMemcpyAsync above to complete before `outputs` - // goes out of scope. A sync is only necessary in the tuple output case, and - // see below for a way to avoid this. - cudaStreamSynchronize(stream); -} -``` - -The `cudaStreamSynchronize` at the end of the function is unfortunate, as it's -not required in the non-tuple-output case, and it can be expensive. One way to -get around this would be to make `outputs` into a global variable and ensure -that the previous cudaMemcpyAsync completed before overwriting the global and -enqueueing another one. This is sketched below. - -``` -void do_custom_call(CUstream stream, void** buffers, const char* opaque, - size_t opaque_len) { - - // ... Beginning of function is the same as above ... - - // ... actually run the kernel ... - - static std::atomic first_time{true}; - static CUevent event; - static void* outputs[2]; - if (first_time.fetch_and(false)) { - // First time running this function. Initialize `event`. - cuEventCreate(&event, CU_EVENT_DISABLE_TIMING); - } else { - // Not first time running this function. Wait for previous event to - // complete before touching `outputs`. - cuEventSynchronize(event); - } - - // Fill the output tuple. - outputs[0] = buffers[7]; - outputs[1] = buffers[8]; - cudaMemcpyAsync(buffers[6], outputs, sizeof(outputs), cudaMemcpyHostToDevice, - stream); - - // Unblock `event` after the memcpy completes. - cuEventRecord(event, stream); -} -``` - -This simple implementation would limit parallelism if you want to run this op on -multiple GPUs concurrently (or on one GPU with multiple streams); in that case -you might need multiple events and globals. We have seen one implementation of -this algorithm which keeps a pool of globals and events and periodically polls -them (perhaps on each call to the op) to garbage collect. diff --git a/tensorflow/compiler/xla/service/gpu/custom_call_test.cc b/tensorflow/compiler/xla/service/gpu/custom_call_test.cc index c04f6fb7bf5..53a3ca14400 100644 --- a/tensorflow/compiler/xla/service/gpu/custom_call_test.cc +++ b/tensorflow/compiler/xla/service/gpu/custom_call_test.cc @@ -90,67 +90,25 @@ void Callback_SubBuffers(CUstream stream, void** buffers, const char* /*opaque*/, size_t /*opaque_len*/) { // `buffers` is a flat array containing device pointers to the following. 
// - // 0: root tuple of param 0 - // 1: param 0 at tuple index {0}, shape f32[128] - // 2: param 0 at tuple index {1}, shape f32[256] - // 3: root tuple of param 1 - // 4: param 1 at tuple index {0}, shape f32[1024] - // 5: param 1 at tuple index {1}, shape f32[8] - // 6: root tuple of custom-call result - // 7: result at tuple index {0}, shape f32[8] - // 8: result at tuple index {1}, shape (f32[128], f32[256]) - // 9: result at tuple index {1, 0}, shape f32[128] - // 10: result at tuple index {1, 1}, shape f32[256] - // 11: result at tuple index {2}, shape f32[1024] + // 0: param 0 at tuple index {0}, shape f32[128] + // 1: param 0 at tuple index {1}, shape f32[256] + // 2: param 1 at tuple index {0}, shape f32[1024] + // 3: param 1 at tuple index {1}, shape f32[8] + // 4: result at tuple index {0}, shape f32[8] + // 5: result at tuple index {1, 0}, shape f32[128] + // 6: result at tuple index {1, 1}, shape f32[256] + // 7: result at tuple index {2}, shape f32[1024] // - // It's the contract of custom-call that the non-root pointers (i.e. - // everything other than indices 0, 3, and 6) may be null, if XLA is unable to - // analyze the program well enough to determine for sure what's in those - // buffers. For this simple example, all of the buffers should be non-null. - // Check the param 0 tuple, namely that - // - // (*buffers[0])[0] == buffers[1] and - // (*buffers[0])[1] == buffers[2]. - // - // because buffers contains pointers to device memory, we have to retrieve - // these values via cudaMemcpy. - void* p0[2]; - cudaMemcpy(p0, buffers[0], 2 * sizeof(void*), cudaMemcpyDeviceToHost); - ASSERT_EQ(p0[0], buffers[1]); - ASSERT_EQ(p0[1], buffers[2]); - - // Check the param 1 tuple, namely that - // - // (*buffers[3])[0] == buffers[4] - // (*buffers[3])[1] == buffers[5]. - void* p1[2]; - cudaMemcpy(p1, buffers[3], 2 * sizeof(void*), cudaMemcpyDeviceToHost); - ASSERT_EQ(p1[0], buffers[4]); - ASSERT_EQ(p1[1], buffers[5]); - - // We don't have an equivalent check for the output tuple (i.e. we don't check - // (*buffers[6])[0] == buffers[7]) because it's up to us to set the tuple - // as part of this custom-call. - - // Write the results. First set the root tuple output buffer to {b7, b8, - // b11}. - void* root[3] = {buffers[7], buffers[8], buffers[11]}; - cudaMemcpy(buffers[6], root, 3 * sizeof(void*), cudaMemcpyHostToDevice); - - // Now set the sub-tuple output buffer at index 8 to {b9, b10}. - void* sub_tuple[2] = {buffers[9], buffers[10]}; - cudaMemcpy(buffers[8], sub_tuple, 2 * sizeof(void*), cudaMemcpyDeviceToHost); - - // Now set output leaf buffers 7, 9, 10, and 11, copying data from the - // corresponding same-sized inputs. - cudaMemcpyAsync(buffers[7], buffers[5], 8 * sizeof(float), + // Set output leaf buffers, copying data from the corresponding same-sized + // inputs. 
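+  // Unlike the old calling convention, there is no tuple bookkeeping to do
+  // here: XLA itself fills in the result tuple's internal pointers after the
+  // custom call returns, so only the leaf buffers need to be written.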
+ cudaMemcpyAsync(buffers[4], buffers[3], 8 * sizeof(float), cudaMemcpyDeviceToDevice, stream); - cudaMemcpyAsync(buffers[9], buffers[1], 128 * sizeof(float), + cudaMemcpyAsync(buffers[5], buffers[0], 128 * sizeof(float), cudaMemcpyDeviceToDevice, stream); - cudaMemcpyAsync(buffers[10], buffers[2], 256 * sizeof(float), + cudaMemcpyAsync(buffers[6], buffers[1], 256 * sizeof(float), cudaMemcpyDeviceToDevice, stream); - cudaMemcpyAsync(buffers[11], buffers[4], 1024 * sizeof(float), + cudaMemcpyAsync(buffers[7], buffers[2], 1024 * sizeof(float), cudaMemcpyDeviceToDevice, stream); } XLA_REGISTER_CUSTOM_CALL_TARGET(Callback_SubBuffers, "CUDA"); @@ -185,5 +143,45 @@ TEST_F(CustomCallTest, SubBuffers) { EXPECT_THAT(result.data({2}), ::testing::Each(3)); } +void Callback_TupleSelect(CUstream stream, void** buffers, + const char* /*opaque*/, size_t /*opaque_len*/) { + // Set the two output leaf buffers equal to the two input leaf buffers. + cudaMemcpyAsync(buffers[2], buffers[0], 10 * sizeof(float), + cudaMemcpyDeviceToDevice, stream); + cudaMemcpyAsync(buffers[3], buffers[1], 10 * sizeof(float), + cudaMemcpyDeviceToDevice, stream); +} +XLA_REGISTER_CUSTOM_CALL_TARGET(Callback_TupleSelect, "CUDA"); +// Tuple-shaped select is a case where XLA can't know all buffer assignments +// statically ahead of time and has to walk the on-device tuple sub-buffers. +TEST_F(CustomCallTest, TupleSelect) { + XlaBuilder b(TestName()); + auto tuple_shape = ShapeUtil::MakeTupleShape({ + ShapeUtil::MakeShape(F32, {10}), + ShapeUtil::MakeShape(F32, {10}), + }); + auto p0 = AddParam(LiteralUtil::CreateR0(false), &b); + auto p1 = + AddParam(LiteralUtil::MakeTupleOwned( + LiteralUtil::CreateR1(std::vector(10, 1.0f)), + LiteralUtil::CreateR1(std::vector(10, 2.0f))), + &b); + auto p2 = + AddParam(LiteralUtil::MakeTupleOwned( + LiteralUtil::CreateR1(std::vector(10, 10.0f)), + LiteralUtil::CreateR1(std::vector(10, 20.0f))), + &b); + auto cc = CustomCall(&b, "Callback_TupleSelect", + /*operands=*/{Select(p0, p1, p2)}, tuple_shape, + /*opaque=*/""); + + // Do a tuple-select on the custom-call result to ensure that the custom-call + // sets its output tuple index buffers. + Select(p0, p1, cc); + TF_ASSERT_OK_AND_ASSIGN(auto result, ComputeAndTransfer(&b, {})); + EXPECT_THAT(result.data({0}), ::testing::Each(10)); + EXPECT_THAT(result.data({1}), ::testing::Each(20)); +} + } // anonymous namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc b/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc index 5fba64e90ed..65673106391 100644 --- a/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/custom_call_thunk.cc @@ -48,8 +48,83 @@ CustomCallThunk::CustomCallThunk( instr->shape().ToString(), result_slices.shape().ToString()); } +// For each leaf in a preorder traversal of `slices`, appends its device address +// to `buffers`. +// +// In the common case, this is trivial; simply iterate over the ShapeTree and +// add every leaf to `buffers`. But under some circumstances XLA doesn't +// statically know the address of a leaf buffer and has to derive it by walking +// the on-device tuple. +static Status AppendBuffersFor(const ShapeTree& slices, + const BufferAllocations* buffer_allocations, + se::Stream* stream, + std::vector* buffers) { + // Buffer addresses we've retrieved by following device tuples. + ShapeTree retrieved_addrs(slices.shape()); + + // We make this lambda an std::function so it can capture itself. 
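+  // Addresses that do have to be pulled down from the device are cached in
+  // `retrieved_addrs`, so each level of an on-device tuple is copied to the
+  // host at most once per call.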
+ std::function(const ShapeIndexView&)> get_addr_for = + [&](ShapeIndexView index) -> StatusOr { + auto slice = slices.element(index); + + // If we know the address of this sub-buffer statically, return it. + if (slice.allocation() != nullptr) { + return buffer_allocations->GetDeviceAddress(slice).opaque(); + } + // If we've already pulled the address for this sub-buffer down from the + // GPU, return it. + if (retrieved_addrs.element(index) != nullptr) { + return retrieved_addrs.element(index); + } + + // Recurse to get the address of the parent sub-buffer. + CHECK(!index.empty()) << "Address of tuple root cannot be unknown!"; + TF_ASSIGN_OR_RETURN(void* parent_buffer, get_addr_for(index.ConsumeBack())); + + // Pull down the entirety of parent_buffer from the GPU, getting the address + // we're interested in plus all of its siblings. (Perhaps only some of the + // siblings are unknown and we could get away without retrieving all of + // them. But in practice, getting them all in one fell swoop should be just + // as fast as getting just one.) + // + // TODO(jlebar): This is not as efficient as possible. In particular, at + // the expense of some complexity we could batch up multiple parallel D2H + // copies (say for multiple unrelated sub-buffers, maybe even across + // different parameters) and do just one BlockHostUntilDone. Hopefully the + // case when we have to do any copies at all is uncommon. + int64 num_siblings = + ShapeUtil::GetSubshape(slices.shape(), index.ConsumeBack()) + .tuple_shapes_size(); + std::vector sibling_addrs(num_siblings); + TF_RETURN_IF_ERROR( + stream + ->ThenMemcpy(sibling_addrs.data(), + se::DeviceMemoryBase(parent_buffer, sizeof(void*)), + num_siblings * sizeof(void*)) + .BlockHostUntilDone()); + + // Save the data we retrieved into retrieved_addrs. + for (int64 i = 0; i < num_siblings; ++i) { + ShapeIndex sibling_index(index.ConsumeBack()); + sibling_index.push_back(i); + *retrieved_addrs.mutable_element(sibling_index) = sibling_addrs[i]; + } + return sibling_addrs[index.back()]; + }; + + return slices.ForEachElementWithStatus( + [&](const ShapeIndex& index, const BufferAllocation::Slice&) { + if (slices.IsLeaf(index)) { + TF_ASSIGN_OR_RETURN(void* addr, get_addr_for(index)); + buffers->push_back(addr); + } + return Status::OK(); + }); +} + Status CustomCallThunk::ExecuteOnStream(const ExecuteParams& params) { // gpu_stream is CUstream or e.g. the equivalent type in ROCm. + se::Stream* stream = params.stream; auto gpu_stream = se::gpu::AsGpuStreamValue(params.stream); auto typed_call_target = reinterpret_cast buffers; - auto append_buffers = [&](const ShapeTree& slices) { - slices.ForEachElement([&](const ShapeIndex& /*index*/, - const BufferAllocation::Slice& slice) { - if (slice.allocation() == nullptr) { - buffers.push_back(nullptr); - } - buffers.push_back( - params.buffer_allocations->GetDeviceAddress(slice).opaque()); - }); - }; for (const auto& slices : operand_slices_) { - append_buffers(slices); + TF_RETURN_IF_ERROR( + AppendBuffersFor(slices, params.buffer_allocations, stream, &buffers)); } - append_buffers(result_slices_); + TF_RETURN_IF_ERROR(AppendBuffersFor(result_slices_, params.buffer_allocations, + stream, &buffers)); typed_call_target(gpu_stream, buffers.data(), opaque_.data(), opaque_.size()); - return Status::OK(); + + // If the custom-call returns a tuple, populate the result tuple index + // buffers. 
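+  // Each non-leaf node of the result shape gets its on-device buffer filled
+  // with the device addresses of its children, so the result stays walkable
+  // as a tuple for downstream consumers even though the callee only ever saw
+  // leaf buffers.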
+ return result_slices_.ForEachElementWithStatus( + [&](const ShapeIndex& index, const BufferAllocation::Slice& slice) { + const Shape& subshape = + ShapeUtil::GetSubshape(result_slices_.shape(), index); + auto n = subshape.tuple_shapes_size(); + if (!subshape.IsTuple() || n == 0) { + return Status::OK(); + } + auto tuple_ptrs = absl::make_unique(n); + ShapeIndex subindex(index); + for (int i = 0; i < n; ++i) { + subindex.push_back(i); + tuple_ptrs[i] = + params.buffer_allocations + ->GetDeviceAddress(result_slices_.element(subindex)) + .opaque(); + subindex.pop_back(); + } + SafeH2DMemcpy(se::DeviceMemory( + params.buffer_allocations->GetDeviceAddress(slice)), + std::move(tuple_ptrs), n, stream); + return Status::OK(); + }); } } // namespace gpu diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h index ebb56746518..e2d74627c60 100644 --- a/tensorflow/compiler/xla/shape_util.h +++ b/tensorflow/compiler/xla/shape_util.h @@ -43,6 +43,8 @@ limitations under the License. namespace xla { +class ShapeIndexView; + // An index for specifying a particular nested subshape within a shape. Used in // ShapeUtil::GetSubshape and other interfaces. Shapes are recursive data // structures (trees) and ShapeIndex defines a path through the tree where each @@ -69,6 +71,8 @@ class ShapeIndex { template ShapeIndex(InputIt start, InputIt end) : indices_(start, end) {} + explicit ShapeIndex(ShapeIndexView v); + bool empty() const { return indices_.empty(); } size_t size() const { return indices_.size(); } void push_back(int64 value) { indices_.push_back(value); } @@ -137,6 +141,10 @@ class ShapeIndexView { CHECK(!empty()); return indices_.front(); } + int64 back() const { + CHECK(!empty()); + return indices_.back(); + } ShapeIndexView ConsumeFront() const { ShapeIndexView result = *this; result.indices_.remove_prefix(1); @@ -161,6 +169,9 @@ class ShapeIndexView { absl::Span indices_; }; +inline ShapeIndex::ShapeIndex(ShapeIndexView v) + : ShapeIndex(v.begin(), v.end()) {} + std::ostream& operator<<(std::ostream& out, const ShapeIndex& shape_index); std::ostream& operator<<(std::ostream& out, const ShapeIndexView& shape_index); From f72f8bef216ed2712f1e69468e375c901b061ace Mon Sep 17 00:00:00 2001 From: Oscar Ramirez Date: Mon, 22 Jul 2019 11:17:25 -0700 Subject: [PATCH 0315/3053] Remove left over debug loggging. PiperOrigin-RevId: 259367050 --- tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc index ebf704c0718..a8b57eee37a 100644 --- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc @@ -2759,7 +2759,6 @@ class ConvertExpm1Stage : public ArithmeticOptimizerStage { // input data type is not supported by expm1. Skip. return Status::OK(); } - LOG(INFO) << "Got element = " << element; if (element != complex128(1)) { // current element is not 1. Skip. return Status::OK(); From a9fee430223d70c684b359aebadce33953296d68 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Wed, 26 Jun 2019 10:43:10 -0700 Subject: [PATCH 0316/3053] Add the XLA_FLAGS xla_gpu_ptx_code to allow specifing the PTX code to use. 
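The intended loop here is: dump the PTX that XLA generates for a module, hand-edit it, and pass the edited file back through this flag so the GPU backend loads it instead of compiling the module's PTX itself. A rough sketch of that workflow follows; the script name, paths and module id are illustrative and not part of this change, only the flags are real.

```python
# Hypothetical driver script; file names and paths are made up for illustration.
import os
import subprocess

# Step 1: run once with dumping enabled to capture the generated PTX, e.g.
# /tmp/xla_dump/module_0001.ptx (the exact name depends on the module id).
os.environ["XLA_FLAGS"] = "--xla_dump_to=/tmp/xla_dump"
subprocess.run(["python", "train.py"], check=True)

# Step 2: edit the PTX, keep the "module_<id>." filename prefix so the backend
# can match it to the right module, and run again with XLA pointed at the file.
os.environ["XLA_FLAGS"] = "--xla_gpu_ptx_code=module_0001.ptx"
subprocess.run(["python", "train.py"], check=True)
```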
--- .../compiler/xla/debug_options_flags.cc | 14 +++++++++ tensorflow/compiler/xla/service/dump.cc | 8 ++--- tensorflow/compiler/xla/service/dump.h | 3 ++ .../xla/service/gpu/nvptx_compiler.cc | 31 ++++++++++++++++++- tensorflow/compiler/xla/xla.proto | 9 ++++-- 5 files changed, 57 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/xla/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc index 45f9cbe4ce8..920d1d1e2c5 100644 --- a/tensorflow/compiler/xla/debug_options_flags.cc +++ b/tensorflow/compiler/xla/debug_options_flags.cc @@ -149,6 +149,12 @@ static void AllocateFlags() { return true; }; + // Custom "sub-parser" lambda for xla_gpu_ptx_code + auto setter_for_xla_gpu_ptx_code = [](string value) { + flag_values->add_xla_gpu_ptx_code(value); + return true; + }; + // Custom "sub-parser" lambda for xla_backend_extra_options. auto setter_for_xla_backend_extra_options = [](string comma_separated_values) { @@ -342,6 +348,14 @@ static void AllocateFlags() { int32_setter_for(&DebugOptions::set_xla_gpu_max_kernel_unroll_factor), flag_values->xla_gpu_max_kernel_unroll_factor(), "Specify the maximum kernel unroll factor for the GPU backend."), + tensorflow::Flag("xla_gpu_ptx_code", + setter_for_xla_gpu_ptx_code, "", + "If non-empty, speficies a file containing ptx to use." + "The filename prefix must have the same pattern as PTX dumped by XLA. " + "This allows to match one specific module." + "General workflow. Get the " + "generated module ptx from XLA. Modify it. Then pass it " + "back via this option."), tensorflow::Flag( "xla_test_all_output_layouts", bool_setter_for(&DebugOptions::set_xla_test_all_output_layouts), diff --git a/tensorflow/compiler/xla/service/dump.cc b/tensorflow/compiler/xla/service/dump.cc index 6a4837211e8..331c935bdc9 100644 --- a/tensorflow/compiler/xla/service/dump.cc +++ b/tensorflow/compiler/xla/service/dump.cc @@ -136,10 +136,6 @@ struct CanonicalDebugOptions { bool dump_snapshots; }; -string FilenameFor(const HloModule& module, string_view suffix) { - return StrFormat("module_%04d.%s", module.unique_id(), suffix); -} - void DumpToFileInDirImpl(string_view filename, string_view contents, const CanonicalDebugOptions& opts) { if (opts.dumping_to_stdout()) { @@ -263,6 +259,10 @@ static auto& module_id_to_step_number GUARDED_BY(mu) = } // namespace +string FilenameFor(const HloModule& module, string_view suffix) { + return StrFormat("module_%04d.%s", module.unique_id(), suffix); +} + void DumpToFileInDir(const HloModule& module, string_view suffix, string_view contents) { DumpToFileInDirImpl(FilenameFor(module, suffix), contents, diff --git a/tensorflow/compiler/xla/service/dump.h b/tensorflow/compiler/xla/service/dump.h index 6edc9b28dde..d245ad582c4 100644 --- a/tensorflow/compiler/xla/service/dump.h +++ b/tensorflow/compiler/xla/service/dump.h @@ -33,6 +33,9 @@ class BufferAssignment; class HloExecutionProfile; class HloSnapshot; +// Create the filename we will use to dump in DumpToFileInDir. +string FilenameFor(const HloModule& module, absl::string_view suffix); + // Writes the given string to a file in the xla_dump_to directory specified by // module's DebugOptions. // diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc index 20b3d64c417..3fbd5735af1 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include #include +#include #include #include // NOLINT(build/c++11): only using std::call_once, not mutex. #include @@ -626,7 +627,35 @@ StatusOr> NVPTXCompiler::RunBackend( } string ptx; - { + + // Generate the PTX or load it if provided. + // If the xla_gpu_ptx_code options is set, be explicit when a file is used + // and warn when a file is not used to ease catching typo in filename. + string prefix = FilenameFor(*module, ptx); + string ptx_filename; + for (const string filename : module->config().debug_options().xla_gpu_ptx_code()) { + // To ease comparing many PTX versions, accept different suffix then + // the original filename. + if(absl::StartsWith(filename, prefix)) { + ptx_filename = filename; + VLOG(0) << "RunBackend() - Will load PTX from file: " << filename; + break; + } else { + VLOG(0) << "RunBackend() - For module with prefix '" << prefix + << "', we skip PTX code file: " << filename; + } + } + if (module->config().debug_options().xla_gpu_ptx_code().size() > 0 && + ptx_filename.size() == 0) { + VLOG(0) << "RunBackend() - For module with prefix '" << prefix + << "', we did not found a PTX file to load."; + } + if(!ptx_filename.empty()) { + std::ifstream ifs(ptx_filename, std::ifstream::in); + ptx = std::string(std::istreambuf_iterator(ifs), + std::istreambuf_iterator()); + CHECK(ptx.size() > 0) << "Empty or non existing PTX file: " << ptx_filename; + } else { XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunBackend - CompileToPtx"); TF_ASSIGN_OR_RETURN(ptx, CompileToPtx(&llvm_module, {cc_major, cc_minor}, module->config(), libdevice_dir)); diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto index 7a40e4096de..6c401d8e4ab 100644 --- a/tensorflow/compiler/xla/xla.proto +++ b/tensorflow/compiler/xla/xla.proto @@ -276,13 +276,16 @@ message DebugOptions { // directory. bool xla_dump_hlo_snapshots = 118; + bool xla_gpu_force_conv_nchw = 125; + + // Path to a file with ptx code. + repeated string xla_gpu_ptx_code = 127; + // // END flags controlling dumping HLO modules. // - bool xla_gpu_force_conv_nchw = 125; - - // Next id: 127 + // Next id: 128 // Extra options to pass to the compilation backend (e.g. LLVM); specific // interpretation of these values is left to the backend. From c705cba9e926f4c76ed688a5bf84f6e3c41fc702 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Thu, 18 Jul 2019 09:27:45 -0700 Subject: [PATCH 0317/3053] Fix many of the comments. --- .../compiler/xla/debug_options_flags.cc | 22 +++++++++---------- .../xla/service/gpu/nvptx_compiler.cc | 21 ++++++++---------- tensorflow/compiler/xla/xla.proto | 10 ++++----- 3 files changed, 25 insertions(+), 28 deletions(-) diff --git a/tensorflow/compiler/xla/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc index 920d1d1e2c5..1680f58d751 100644 --- a/tensorflow/compiler/xla/debug_options_flags.cc +++ b/tensorflow/compiler/xla/debug_options_flags.cc @@ -149,9 +149,9 @@ static void AllocateFlags() { return true; }; - // Custom "sub-parser" lambda for xla_gpu_ptx_code - auto setter_for_xla_gpu_ptx_code = [](string value) { - flag_values->add_xla_gpu_ptx_code(value); + // Custom "sub-parser" lambda for xla_gpu_ptx_file. 
+ auto setter_for_xla_gpu_ptx_file = [](string value) { + flag_values->add_xla_gpu_ptx_file(value); return true; }; @@ -348,14 +348,14 @@ static void AllocateFlags() { int32_setter_for(&DebugOptions::set_xla_gpu_max_kernel_unroll_factor), flag_values->xla_gpu_max_kernel_unroll_factor(), "Specify the maximum kernel unroll factor for the GPU backend."), - tensorflow::Flag("xla_gpu_ptx_code", - setter_for_xla_gpu_ptx_code, "", - "If non-empty, speficies a file containing ptx to use." - "The filename prefix must have the same pattern as PTX dumped by XLA. " - "This allows to match one specific module." - "General workflow. Get the " - "generated module ptx from XLA. Modify it. Then pass it " - "back via this option."), + tensorflow::Flag("xla_gpu_ptx_file", + setter_for_xla_gpu_ptx_file, "", + "If non-empty, speficies a file containing ptx to use. " + "The filename prefix must have the same pattern as PTX " + "dumped by XLA. This allows to match one specific " + "module. General workflow. Get the generated module " + "ptx from XLA. Modify it. Then pass it back via this " + "option."), tensorflow::Flag( "xla_test_all_output_layouts", bool_setter_for(&DebugOptions::set_xla_test_all_output_layouts), diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc index 3fbd5735af1..3ddacb2c3d9 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -603,7 +603,7 @@ StatusOr> NVPTXCompiler::RunBackend( "Rerun with --xla_dump_to to get the IR. "; } - string libdevice_dir; + std::string libdevice_dir; { tensorflow::mutex_lock lock(mutex_); @@ -626,27 +626,24 @@ StatusOr> NVPTXCompiler::RunBackend( cc_minor = 0; } - string ptx; + std::string ptx; // Generate the PTX or load it if provided. - // If the xla_gpu_ptx_code options is set, be explicit when a file is used + // If the xla_gpu_ptx_file options is set, be explicit when a file is used // and warn when a file is not used to ease catching typo in filename. - string prefix = FilenameFor(*module, ptx); - string ptx_filename; - for (const string filename : module->config().debug_options().xla_gpu_ptx_code()) { + std::string prefix = FilenameFor(*module, ptx); + std::string ptx_filename; + for (const string filename : module->config().debug_options().xla_gpu_ptx_file()) { // To ease comparing many PTX versions, accept different suffix then // the original filename. 
if(absl::StartsWith(filename, prefix)) { ptx_filename = filename; VLOG(0) << "RunBackend() - Will load PTX from file: " << filename; break; - } else { - VLOG(0) << "RunBackend() - For module with prefix '" << prefix - << "', we skip PTX code file: " << filename; } } - if (module->config().debug_options().xla_gpu_ptx_code().size() > 0 && - ptx_filename.size() == 0) { + if (module->config().debug_options().xla_gpu_ptx_file().size() > 0 && + ptx_filename.empty()) { VLOG(0) << "RunBackend() - For module with prefix '" << prefix << "', we did not found a PTX file to load."; } @@ -654,7 +651,7 @@ StatusOr> NVPTXCompiler::RunBackend( std::ifstream ifs(ptx_filename, std::ifstream::in); ptx = std::string(std::istreambuf_iterator(ifs), std::istreambuf_iterator()); - CHECK(ptx.size() > 0) << "Empty or non existing PTX file: " << ptx_filename; + CHECK(!ptx.empty()) << "Empty or non existing PTX file: " << ptx_filename; } else { XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunBackend - CompileToPtx"); TF_ASSIGN_OR_RETURN(ptx, CompileToPtx(&llvm_module, {cc_major, cc_minor}, diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto index 6c401d8e4ab..138af1a833b 100644 --- a/tensorflow/compiler/xla/xla.proto +++ b/tensorflow/compiler/xla/xla.proto @@ -276,15 +276,15 @@ message DebugOptions { // directory. bool xla_dump_hlo_snapshots = 118; - bool xla_gpu_force_conv_nchw = 125; - - // Path to a file with ptx code. - repeated string xla_gpu_ptx_code = 127; - // // END flags controlling dumping HLO modules. // + bool xla_gpu_force_conv_nchw = 125; + + // Paths to files with ptx code. + repeated string xla_gpu_ptx_file = 127; + // Next id: 128 // Extra options to pass to the compilation backend (e.g. LLVM); specific From facf061d5cbacf8fef2e196bf79c8bd1a96ddb8b Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Mon, 22 Jul 2019 11:21:06 -0700 Subject: [PATCH 0318/3053] [Grappler] Do not validate side effects execution order for ops that we know are not required to run in program order Fix for https://github.com/tensorflow/tfjs/issues/1740 PiperOrigin-RevId: 259367822 --- .../grappler/optimizers/function_optimizer.cc | 49 +++++++++++++++---- .../python/framework/auto_control_deps.py | 2 + 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc index deb9abab08f..b4f5c36bb9c 100644 --- a/tensorflow/core/grappler/optimizers/function_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc @@ -816,6 +816,39 @@ bool MarkedForXlaCompilation(const Node* n) { return CheckStringAttr(n, kXlaClusterAttr); } +const bool IsExemptFromSideEffectsExecutionValidation(const string& op) { + static const auto* exemption = new absl::flat_hash_set({ + // LINT.IfChange + // Op types that should not run in program order, e.g. because they need + // to run asynchronously to avoid deadlock. + "CollectiveGather", + "CollectiveReduce", + "CollectiveBcastSend", + "CollectiveBcastRecv", + "NcclAllReduce", + + // Legacy random ops. + // See details in tensorflow/python/framework/auto_control_deps.py. 
+ "RandomUniform", + "RandomUniformInt", + "RandomStandardNormal", + "ParameterizedTruncatedNormal", + "TruncatedNormal", + "RandomShuffle", + "Multinomial", + "RandomGamma", + "RandomGammaGrad", + "RandomPoisson", + "RandomPoissonV2", + // LINT.ThenChange(//tensorflow/python/framework/auto_control_deps.py) + + // ReadVariableOp marked as stateful because it consumes DT_RESOURCE, + // but it can't generate any observable side-effect. + "ReadVariableOp", + }); + return exemption->contains(op); +} + // Validates that all side effects inside function body will be executed after // function inlining. We do it by looking for a path from stateful ops, to one // of the output control sources. @@ -826,19 +859,15 @@ Status ValidateSideEffectsExecution( const FunctionBody& fbody, OutputControlSource output_control_source, bool has_outgoing_control_edges, bool validate_outgoing_control_edge = true) { - // ReadVariableOp marked as stateful because it consumes DT_RESOURCE, but it - // can't generate any observable side-effect. - static constexpr const char* const kReadVariableOp = "ReadVariableOp"; - // Find all nodes that can produce side effects in the function body graph. We // use 'is_stateful()' bit as an approximation of "has side effects" property. std::vector fbody_side_effects; - absl::c_copy_if(fbody.graph->nodes(), std::back_inserter(fbody_side_effects), - [](const Node* n) { - return n->op_def().is_stateful() && !n->IsArg() && - !n->IsRetval() && - n->type_string() != kReadVariableOp; - }); + absl::c_copy_if( + fbody.graph->nodes(), std::back_inserter(fbody_side_effects), + [](const Node* n) { + return n->op_def().is_stateful() && !n->IsArg() && !n->IsRetval() && + !IsExemptFromSideEffectsExecutionValidation(n->type_string()); + }); // When graph executed in TF-2.0 context with automatic control dependencies // tracking, absence of outgoing control edge indicates that no one is diff --git a/tensorflow/python/framework/auto_control_deps.py b/tensorflow/python/framework/auto_control_deps.py index 1b45286bfe9..1c16d38cbda 100644 --- a/tensorflow/python/framework/auto_control_deps.py +++ b/tensorflow/python/framework/auto_control_deps.py @@ -30,6 +30,7 @@ from tensorflow.python.util import nest from tensorflow.python.util import object_identity from tensorflow.python.util import tf_decorator +# LINT.IfChange # Op types that should not run in program order, e.g. because they need to run # asynchronously to avoid deadlock. ASYNC_STATEFUL_OPS = [ @@ -85,6 +86,7 @@ LEGACY_RANDOM_OPS = [ "RandomPoisson", "RandomPoissonV2", ] +# LINT.ThenChange(//tensorflow/core/grappler/optimizers/function_optimizer.cc) _ALL_BLACKLISTED_OPS = set(ASYNC_STATEFUL_OPS) | set(LEGACY_RANDOM_OPS) From 7341359745f0308ca16092dc332ec64ca0cacdf5 Mon Sep 17 00:00:00 2001 From: Pavithra Vijay Date: Mon, 22 Jul 2019 11:43:49 -0700 Subject: [PATCH 0319/3053] Add check to see mismatch between input and output batch size in single execution code path. 
PiperOrigin-RevId: 259372653 --- .../python/keras/engine/training_arrays.py | 14 +++-- .../python/keras/engine/training_generator.py | 6 +- .../python/keras/engine/training_utils.py | 62 ++++++++++++++----- .../keras/engine/training_utils_test.py | 12 ++-- tensorflow/python/keras/engine/training_v2.py | 11 +++- 5 files changed, 72 insertions(+), 33 deletions(-) diff --git a/tensorflow/python/keras/engine/training_arrays.py b/tensorflow/python/keras/engine/training_arrays.py index 941bfd6cb91..c6cc78680ad 100644 --- a/tensorflow/python/keras/engine/training_arrays.py +++ b/tensorflow/python/keras/engine/training_arrays.py @@ -239,11 +239,15 @@ def model_iteration(model, # Select aggregation method. if mode == ModeKeys.PREDICT: - aggregator = training_utils.OutputsAggregator(use_steps, - num_samples_or_steps) + aggregator = training_utils.OutputsAggregator( + use_steps, + num_samples=None if steps_per_epoch else num_samples_or_steps, + steps=steps_per_epoch) else: - aggregator = training_utils.MetricsAggregator(use_steps, - num_samples_or_steps) + aggregator = training_utils.MetricsAggregator( + use_steps, + num_samples=None if steps_per_epoch else num_samples_or_steps, + steps=steps_per_epoch) if model._compile_distribution: distributed_training_utils._copy_weights_to_distributed_model(model, mode) @@ -307,7 +311,7 @@ def model_iteration(model, % (steps_name, steps_per_epoch * epochs)) elif step > 0: steps_per_epoch = step - aggregator.num_samples_or_steps = steps_per_epoch + aggregator.steps = steps_per_epoch if mode == ModeKeys.TRAIN: progbar.params['steps'] = steps_per_epoch progbar.progbar.target = steps_per_epoch diff --git a/tensorflow/python/keras/engine/training_generator.py b/tensorflow/python/keras/engine/training_generator.py index 51368098074..b033c98770f 100644 --- a/tensorflow/python/keras/engine/training_generator.py +++ b/tensorflow/python/keras/engine/training_generator.py @@ -182,9 +182,9 @@ def model_iteration(model, progbar.params['verbose'] = verbose if mode == ModeKeys.PREDICT: - aggregator = training_utils.OutputsAggregator(True, steps_per_epoch) + aggregator = training_utils.OutputsAggregator(True, steps=steps_per_epoch) else: - aggregator = training_utils.MetricsAggregator(True, steps_per_epoch) + aggregator = training_utils.MetricsAggregator(True, steps=steps_per_epoch) should_set_learning_phase = context.executing_eagerly() and model.run_eagerly if should_set_learning_phase: @@ -236,7 +236,7 @@ def model_iteration(model, % (steps_name, steps_per_epoch * epochs)) elif step > 0: steps_per_epoch = step - aggregator.num_samples_or_steps = steps_per_epoch + aggregator.steps = steps_per_epoch if mode == ModeKeys.TRAIN: progbar.params['steps'] = steps_per_epoch progbar.progbar.target = steps_per_epoch diff --git a/tensorflow/python/keras/engine/training_utils.py b/tensorflow/python/keras/engine/training_utils.py index 6a3ea5a32c7..a652807b5ce 100644 --- a/tensorflow/python/keras/engine/training_utils.py +++ b/tensorflow/python/keras/engine/training_utils.py @@ -62,13 +62,18 @@ class Aggregator(object): Attributes: use_steps: Whether the loop is using `step` or `batch_size`. - num_samples_or_steps: Either `batch_size*num_batches` or `steps`. + num_samples: Total number of samples: `batch_size * num_batches`. + steps: Total number of steps. + batch_size: Batch size. It is used for validation checks between inputs and + outputs. results: What to return at the end of the aggregation loop. 
""" - def __init__(self, use_steps, num_samples_or_steps): + def __init__(self, use_steps, num_samples=None, steps=None, batch_size=None): self.use_steps = use_steps - self.num_samples_or_steps = num_samples_or_steps + self.num_samples = num_samples + self.steps = steps + self.batch_size = batch_size self.results = [] @abc.abstractmethod @@ -100,7 +105,21 @@ class Aggregator(object): class MetricsAggregator(Aggregator): - """Aggregator that calculates loss and metrics info.""" + """Aggregator that calculates loss and metrics info. + + Attributes: + use_steps: Whether the loop is using `step` or `batch_size`. + num_samples: Total number of samples: `batch_size*num_batches`. + steps: Total number of steps, ie number of times to iterate over a dataset + to cover all samples. + """ + + def __init__(self, use_steps, num_samples=None, steps=None): + super(MetricsAggregator, self).__init__( + use_steps=use_steps, + num_samples=num_samples, + steps=steps, + batch_size=None) def create(self, batch_outs): self.results = [0.] * len(batch_outs) @@ -117,7 +136,7 @@ class MetricsAggregator(Aggregator): def finalize(self): if not self.results: raise ValueError('Empty training data.') - self.results[0] /= self.num_samples_or_steps + self.results[0] /= (self.num_samples or self.steps) class ConcatAggregator(Aggregator): @@ -127,16 +146,25 @@ class ConcatAggregator(Aggregator): structure of tensor-likes. """ - def __init__(self): + def __init__(self, batch_size): self.composite = None super(ConcatAggregator, self).__init__( - use_steps=True, num_samples_or_steps=None) + use_steps=True, num_samples=None, steps=None, batch_size=batch_size) def create(self, batch_element): self.composite = composite_tensor_utils.is_composite_or_composite_value( batch_element) def aggregate(self, batch_element, batch_start=None, batch_end=None): + + # TODO(psv): Add num_samples check here to detect when output batch + # #samples is < batch size and != input batch #samples. + if self.batch_size and self.batch_size < batch_element.shape[0]: + raise ValueError( + 'Mismatch between expected batch size and model output batch size. ' + 'Output shape = {}, expected output shape = shape {}'.format( + batch_element.shape, + (self.batch_size,) + batch_element.shape[1:])) self.results.append(batch_element) def finalize(self): @@ -203,17 +231,20 @@ class SliceAggregator(Aggregator): _BINARY_SIZE_THRESHOLD = 2 ** 14 _MAX_COPY_SECONDS = 300 - def __init__(self, num_samples_or_steps): + def __init__(self, num_samples, batch_size): self._async_copies = [] self._pool = get_copy_pool() self._errors = [] super(SliceAggregator, self).__init__( - use_steps=False, num_samples_or_steps=num_samples_or_steps) + use_steps=False, + num_samples=num_samples, + steps=None, + batch_size=batch_size) def create(self, batch_element): # This step does not need to be pipelined because NumPy empty array # initialization is effectively instantaneous. - shape = (self.num_samples_or_steps,) + batch_element.shape[1:] + shape = (self.num_samples,) + batch_element.shape[1:] dtype = batch_element.dtype if isinstance(batch_element, ops.EagerTensor): dtype = dtype.as_numpy_dtype() @@ -226,8 +257,8 @@ class SliceAggregator(Aggregator): six.reraise(type(self._errors[0]), self._errors[0]) # In the special case of single batch inference, no copy is needed. 
- if batch_end - batch_start == self.num_samples_or_steps: - if self.num_samples_or_steps != batch_element.shape[0]: + if batch_end - batch_start == self.num_samples: + if self.num_samples != batch_element.shape[0]: raise ValueError( 'Mismatch between expected batch size and model output batch size. ' 'Output shape = {}, expected output shape = shape {}'.format( @@ -291,10 +322,11 @@ class OutputsAggregator(Aggregator): # If the output is not a ndarray, it will be either a composite tensor # or a composite tensor's Value object. In either case, we can't # allocate an array to hold the object - we'll handle it later. - self.results.append(ConcatAggregator()) + self.results.append(ConcatAggregator(self.batch_size)) elif isinstance(batch_element, (np.ndarray, ops.EagerTensor)): - self.results.append(ConcatAggregator() if self.use_steps else - SliceAggregator(self.num_samples_or_steps)) + self.results.append( + (ConcatAggregator(self.batch_size) if self.use_steps else + SliceAggregator(self.num_samples, self.batch_size))) else: # This is not a ndarray, a CompositeTensor, or a CompositeTensorValue. # Fail fast rather than trying to concatenate it. diff --git a/tensorflow/python/keras/engine/training_utils_test.py b/tensorflow/python/keras/engine/training_utils_test.py index 0ef0066829a..1a6917e2e21 100644 --- a/tensorflow/python/keras/engine/training_utils_test.py +++ b/tensorflow/python/keras/engine/training_utils_test.py @@ -309,8 +309,7 @@ class AggregationTest(keras_parameterized.TestCase): training_utils.SliceAggregator._MAX_COPY_SECONDS = self._old_timeout def _run_with_steps(self): - aggregator = training_utils.OutputsAggregator( - use_steps=True, num_samples_or_steps=None) + aggregator = training_utils.OutputsAggregator(use_steps=True) for i, batch in enumerate(np.array_split(_TEST_DATA, 4)): if i == 0: aggregator.create(batch) @@ -324,7 +323,7 @@ class AggregationTest(keras_parameterized.TestCase): def _run_without_steps(self): aggregator = training_utils.OutputsAggregator( - use_steps=False, num_samples_or_steps=6) + use_steps=False, num_samples=6) batch_start = 0 for i, batch in enumerate(np.array_split(_TEST_DATA, 4)): @@ -349,7 +348,7 @@ class AggregationTest(keras_parameterized.TestCase): def test_nested_aggregation(self): aggregator = training_utils.OutputsAggregator( - use_steps=False, num_samples_or_steps=6) + use_steps=False, num_samples=6) batches = np.array_split(_TEST_DATA, 4) batch_start = 0 @@ -366,8 +365,7 @@ class AggregationTest(keras_parameterized.TestCase): self.assertAllEqual(aggregator.results, (_TEST_DATA, _TEST_DATA)) def test_concat_single_batch(self): - aggregator = training_utils.OutputsAggregator( - use_steps=True, num_samples_or_steps=None) + aggregator = training_utils.OutputsAggregator(use_steps=True) data = _TEST_DATA.copy() aggregator.create(data) assert len(aggregator.results) == 1 @@ -379,7 +377,7 @@ class AggregationTest(keras_parameterized.TestCase): def test_slice_single_batch(self): aggregator = training_utils.OutputsAggregator( - use_steps=False, num_samples_or_steps=6) + use_steps=False, num_samples=6) data = _TEST_DATA.copy() aggregator.create(data) assert len(aggregator.results) == 1 diff --git a/tensorflow/python/keras/engine/training_v2.py b/tensorflow/python/keras/engine/training_v2.py index ab362e29f75..dd07a94bae2 100644 --- a/tensorflow/python/keras/engine/training_v2.py +++ b/tensorflow/python/keras/engine/training_v2.py @@ -56,6 +56,7 @@ def run_one_epoch(model, iterator, execution_function, dataset_size=None, + batch_size=None, 
strategy=None, steps_per_epoch=None, mode=ModeKeys.TRAIN, @@ -72,6 +73,7 @@ def run_one_epoch(model, iterator: the dataset iterator to fetch the data. execution_function: a tf.function that can be called with data. dataset_size: the size of iterator, None when unknown. + batch_size: The size of the current batch. strategy: the distribution strategy instance from the model. steps_per_epoch: the number of steps to run for the epoch. mode: the mode for the current epoch. @@ -84,10 +86,10 @@ def run_one_epoch(model, """ if mode == ModeKeys.PREDICT: aggregator = training_utils.OutputsAggregator( - use_steps=True, num_samples_or_steps=steps_per_epoch) + use_steps=True, steps=steps_per_epoch, batch_size=batch_size) else: aggregator = training_utils.MetricsAggregator( - use_steps=True, num_samples_or_steps=steps_per_epoch) + use_steps=True, steps=steps_per_epoch) callbacks = training_context.callbacks progbar = training_context.progbar @@ -118,7 +120,7 @@ def run_one_epoch(model, # The input passed by the user ran out of batches. # Now we know the cardinality of the input(dataset or generator). steps_per_epoch = step - aggregator.num_samples_or_steps = steps_per_epoch + aggregator.steps = steps_per_epoch progbar.params['steps'] = steps_per_epoch progbar.progbar.target = steps_per_epoch else: @@ -281,6 +283,7 @@ class Loop(training_utils.TrainingLoop): training_data_iter, training_function, dataset_size=training_data_adapter.get_size(), + batch_size=training_data_adapter.batch_size(), strategy=strategy, steps_per_epoch=steps_per_epoch, mode=ModeKeys.TRAIN, @@ -310,6 +313,7 @@ class Loop(training_utils.TrainingLoop): eval_data_iter, eval_function, dataset_size=validation_adapter.get_size(), + batch_size=validation_adapter.batch_size(), strategy=strategy, steps_per_epoch=validation_steps, mode=ModeKeys.TEST, @@ -384,6 +388,7 @@ class Loop(training_utils.TrainingLoop): data_iterator, execution_function, dataset_size=adapter.get_size(), + batch_size=adapter.batch_size(), strategy=strategy, steps_per_epoch=steps, mode=mode, From 3782019ca739f2b00f1f1990d737e7be65e09df9 Mon Sep 17 00:00:00 2001 From: Pavithra Vijay Date: Mon, 22 Jul 2019 11:48:08 -0700 Subject: [PATCH 0320/3053] 1. Add support for temporal sample weight mode in non-graph networks. 2. Add correctness tests for temporal sample weight. 
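Concretely, 'temporal' mode means the sample weights carry one entry per timestep instead of one per sample, and with this change that also works for subclassed (non-graph) models, which previously fell into the 'sample weighting not supported' branch. A rough usage sketch follows; the model is a toy stand-in, while the data shapes mirror the ones used in the new correctness test.

```python
# Rough sketch, not part of this change: a subclassed model trained with
# per-timestep weights of shape (batch, timesteps).
import numpy as np
import tensorflow as tf


class TemporalModel(tf.keras.Model):

  def __init__(self):
    super(TemporalModel, self).__init__()
    self.repeat = tf.keras.layers.RepeatVector(2)  # (batch, 1) -> (batch, 2, 1)
    self.head = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(1))

  def call(self, inputs):
    return self.head(self.repeat(inputs))


model = TemporalModel()
model.compile(optimizer="sgd", loss="mae", sample_weight_mode="temporal")

x = np.array([[0.], [1.], [2.]])
y = np.array([[[.5], [1.]], [[2.], [2.5]], [[3.5], [2.5]]])
w = np.array([[.5, 2.], [.5, 2.], [.5, 2.]])  # one weight per (sample, timestep)
model.fit(x, y, sample_weight=w, batch_size=3, epochs=2)
```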
PiperOrigin-RevId: 259373595 --- tensorflow/python/keras/BUILD | 12 + tensorflow/python/keras/engine/training.py | 32 +- ...emporal_sample_weights_correctness_test.py | 537 ++++++++++++++++++ 3 files changed, 566 insertions(+), 15 deletions(-) create mode 100644 tensorflow/python/keras/temporal_sample_weights_correctness_test.py diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD index 35866d35d3f..e0d9c0a3872 100755 --- a/tensorflow/python/keras/BUILD +++ b/tensorflow/python/keras/BUILD @@ -714,6 +714,18 @@ tf_py_test( shard_count = 4, ) +tf_py_test( + name = "temporal_sample_weights_correctness_test", + size = "medium", + srcs = ["temporal_sample_weights_correctness_test.py"], + additional_deps = [ + ":keras", + "//third_party/py/numpy", + "//tensorflow/python:client_testlib", + ], + shard_count = 12, +) + tf_py_test( name = "applications_test", size = "medium", diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index a415358ff03..cdc06daae6a 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -1490,7 +1490,8 @@ class Model(network.Network): return if sample_weights and any([s is not None for s in sample_weights]): for endpoint in self._training_endpoints: - endpoint.sample_weight_mode = self.sample_weight_mode or 'samplewise' + endpoint.sample_weight_mode = ( + endpoint.sample_weight_mode or 'samplewise') else: for endpoint in self._training_endpoints: endpoint.sample_weight_mode = None @@ -1774,7 +1775,7 @@ class Model(network.Network): else: sample_weights = [None] * len(self._training_endpoints) for endpoint, weight in zip(self._training_endpoints, sample_weights): - endpoint.populate_sample_weight(weight) + endpoint.populate_sample_weight(weight, endpoint.sample_weight_mode) def _cache_output_metric_attributes(self, metrics, weighted_metrics): """Caches metric name and function attributes for every model output.""" @@ -2424,6 +2425,7 @@ class Model(network.Network): weighted_metrics=self._compile_weighted_metrics, loss_weights=self.loss_weights, target_tensors=target_tensors, + sample_weight_mode=self.sample_weight_mode, run_eagerly=self.run_eagerly, run_distributed=self._run_distributed) @@ -2491,16 +2493,16 @@ class Model(network.Network): nest.assert_same_structure(a, b, expand_composites=True) if y is not None: + # Prepare self._sample_weight_modes. List with the same length as + # model outputs. + training_utils.prepare_sample_weight_modes(self._training_endpoints, + self.sample_weight_mode) + feed_output_names = self._feed_output_names + feed_sample_weight_modes = self._sample_weight_modes if not self._is_graph_network: - feed_output_names = self._feed_output_names feed_output_shapes = None - # Sample weighting not supported in this case. - # TODO(fchollet): consider supporting it. - feed_sample_weight_modes = [None for _ in self.outputs] else: - feed_output_names = self._feed_output_names feed_output_shapes = self._feed_output_shapes - feed_sample_weight_modes = self._sample_weight_modes # Standardize the outputs. 
y = training_utils.standardize_input_data( @@ -3022,20 +3024,20 @@ class _TrainingEndpoint(object): (self.sample_weight_mode is not None and self.sample_weight is None) or (self.sample_weight_mode is None and self.sample_weight is not None)) - def populate_sample_weight(self, sample_weight=None): + def populate_sample_weight(self, sample_weight, sample_weight_mode): """Populate the sample weight and based on the sample weight mode.""" - if (sample_weight is None and (self.should_skip_target_weights() or - self.sample_weight_mode is None or - context.executing_eagerly())): + if (sample_weight is None and + (self.should_skip_target_weights() or sample_weight_mode is None or + context.executing_eagerly())): self._sample_weight = None return - assert self.sample_weight_mode in ['temporal', 'samplewise'] - if self.sample_weight_mode == 'temporal': + assert sample_weight_mode in ['temporal', 'samplewise'] + if sample_weight_mode == 'temporal': default_value = [[1.]] shape = [None, None] else: - # self.sample_weight_mode == 'samplewise' + # sample_weight_mode == 'samplewise' default_value = [1.] shape = [None] diff --git a/tensorflow/python/keras/temporal_sample_weights_correctness_test.py b/tensorflow/python/keras/temporal_sample_weights_correctness_test.py new file mode 100644 index 00000000000..e7029516306 --- /dev/null +++ b/tensorflow/python/keras/temporal_sample_weights_correctness_test.py @@ -0,0 +1,537 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests temporal sample weights correctness using Keras model.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.python import tf2 +from tensorflow.python.keras import keras_parameterized +from tensorflow.python.keras import layers +from tensorflow.python.keras import metrics +from tensorflow.python.keras import optimizer_v2 +from tensorflow.python.keras import testing_utils +from tensorflow.python.platform import test + + +class Bias(layers.Layer): + """Layer that add a bias to its inputs.""" + + def build(self, input_shape): + self.bias = self.add_variable('bias', (1,), initializer='zeros') + + def call(self, inputs): + return inputs + self.bias + + def compute_output_shape(self, input_shape): + return input_shape + + +def get_multi_io_temporal_model(): + timesteps = 2 + inp_1 = layers.Input(shape=(1,), name='input_1') + inp_2 = layers.Input(shape=(1,), name='input_2') + x = layers.RepeatVector(timesteps) + out_1 = layers.TimeDistributed(Bias(), name='output_1') + out_2 = layers.TimeDistributed(Bias(), name='output_2') + + branch_a = [inp_1, x, out_1] + branch_b = [inp_2, x, out_2] + return testing_utils.get_multi_io_model(branch_a, branch_b) + + +def get_compiled_multi_io_model_temporal(sample_weight_mode): + model = get_multi_io_temporal_model() + model.compile( + optimizer=optimizer_v2.gradient_descent.SGD(0.1), + loss='mae', + metrics=[metrics.MeanAbsoluteError(name='mae')], + weighted_metrics=[metrics.MeanAbsoluteError(name='mae_2')], + sample_weight_mode=sample_weight_mode, + run_eagerly=testing_utils.should_run_eagerly(), + run_distributed=testing_utils.should_run_distributed()) + return model + + +def run_with_different_sample_weight_mode_inputs(fn, partial_sw=True): + """Executes the given function with different sample weight mode inputs. + + Args: + fn: Training or eval function to execute. + partial_sw: Boolean flag to indicate whether temporal sample weight mode + should be set partially just for one output. + """ + model = get_compiled_multi_io_model_temporal(sample_weight_mode='temporal') + fn(model) + + model = get_compiled_multi_io_model_temporal( + sample_weight_mode=['temporal', 'temporal']) + fn(model) + + model = get_compiled_multi_io_model_temporal(sample_weight_mode={ + 'output_1': 'temporal', + 'output_2': 'temporal' + }) + fn(model) + + if partial_sw: + model = get_compiled_multi_io_model_temporal( + sample_weight_mode=[None, 'temporal']) + fn(model) + + # TODO(b/129700800): Enable after bug is fixed. + # model = get_compiled_multi_io_model_temporal(sample_weight_mode={ + # 'output_2': 'temporal' + # }) + # fn(model) + + +@keras_parameterized.run_with_all_model_types(exclude_models=['sequential']) +@keras_parameterized.run_all_keras_modes +class TestMetricsCorrectnessMultiIOTemporal(keras_parameterized.TestCase): + + def custom_generator_multi_io_temporal(self, sample_weights=None): + """Generator for getting data for temporal multi io model. + + Args: + sample_weights: List of sample_weights. + + Yields: + Tuple of inputs, label, sample weights data. 
+ """ + batch_size = 3 + num_samples = 3 + if sample_weights: + assert len(sample_weights) == 2 + w1 = sample_weights[0] + w2 = sample_weights[1] + else: + w1 = None + w2 = None + iteration = 0 + while True: + batch_index = iteration * batch_size % num_samples + iteration += 1 + start = batch_index + end = start + batch_size + x = [self.x[start:end], self.x[start:end]] + y = [self.y1[start:end], self.y2[start:end]] + if sample_weights: + w = [ + None if w1 is None else w1[start:end], + None if w2 is None else w2[start:end] + ] + else: + w = None + yield x, y, w + + def setUp(self): + super(TestMetricsCorrectnessMultiIOTemporal, self).setUp() + + self.x = np.asarray([[0.], [1.], [2.]]) + self.y1 = np.asarray([[[.5], [1.]], [[2.], [2.5]], [[3.5], [2.5]]]) + self.y2 = np.asarray([[[.5], [1.5]], [[2.], [1.5]], [[3.5], [3.]]]) + + if tf2.enabled(): + self.wmae = 'mae_2' + else: + self.wmae = 'weighted_mae_2' + + # Without weights: + # Epoch 1 - bias = 0 + # y_pred_1 = [[[0.], [0.]], [[1.], [1.]], [[2.], [2.]]] + # y_pred_2 = [[[0.], [0.]], [[1.], [1.]], [[2.], [2.]]] + # mae (y1 - y_pred_1) = [[[.5], [1.]], [[1.], [1.5]], [[1.5], [.5]]] + # mae = [[3/3, 3/3]] = [[1, 1]] = 2/2 = 1 + # mae_2 (y2 - y_pred_2) = [[[.5], [1.5]], [[1.], [.5]], [[1.5], [1.]]] + # mae_2 = [[3/3, 3/3]] = [[1, 1]] = 2/2 = 1 + + # Epoch 2 - bias = 0.1 (2/2 * 0.1) + # y_pred_1 = [[[.1], [.1]], [[1.1], [1.1]], [[2.1], [2.1]]] + # y_pred_2 = [[[.1], [.1]], [[1.1], [1.1]], [[2.1], [2.1]]] + # mae (y1 - y_pred_1) = [[[.4], [.9]], [[.9], [1.4]], [[1.4], [.4]]] + # mae = [[2.7/3, 2.7/3]] = [[0.9, 0.9]] = 1.8/2 = 0.9 + # mae_2 (y2 - y_pred_2) = [[[.4], [1.4]], [[.9], [.4]], [[1.4], [.9]]] + # mae_2 = [[2.7/3, 2.7/3]] = [[0.9, 0.9]] = 1.8/2 = 0.9 + + self.expected_fit_result = { + 'output_1_mae': [1, 0.9], + 'output_2_mae': [1, 0.9], + 'output_1_' + self.wmae: [1, 0.9], + 'output_2_' + self.wmae: [1, 0.9], + 'loss': [2., 1.8], + 'output_1_loss': [1, 0.9], + 'output_2_loss': [1, 0.9], + } + + self.sample_weight_1 = np.asarray([[.5, 2.], [.5, 2.], [.5, 2.]]) + self.sample_weight_2 = np.asarray([[2., .5], [2., .5], [2., .5]]) + + # With weights: + # Epoch 1 + # y_pred_1 = [[[0.], [0.]], [[1.], [1.]], [[2.], [2.]]] + # y_pred_2 = [[[0.], [0.]], [[1.], [1.]], [[2.], [2.]]] + # mae (y1 - y_pred_1) = [[[.5], [1.]], [[1.], [1.5]], [[1.5], [.5]]] + # with weights = [[[.5 * .5], [1 * 2]], + # [[1 * .5], [1.5 * 2]], + # [[1.5 * .5], [.5 * 2]]] + # mae (w/o weights) = [[3/3, 3/3]] = [[1, 1]] = 2/2 = 1 + # mae (weighted mean) = [[1.5/1.5, 6/6]] = [[1, 1]] = 2/2 = 1 + # mae (sum over bs) = [[1.5/3, 6/3]] = [[.5, 2]] = 2.5/2 = 1.25 + + # mae_2 (y2 - y_pred_2) = [[[.5], [1.5]], [[1.], [.5]], [[1.5], [1.]]] + # with weights = [[[.5 * 2], [1.5 * .5]], + # [[1. * 2], [.5 * .5]], + # [[1.5 * 2], [1. 
* .5]]] + # mae_2 (w/o weights) = [[3/3, 3/3]] = [[1, 1]] = 2/2 = 1 + # mae_2 (weighted mean) = [[6/6, 1.5/1.5]] = [[1, 1]] = 2/2 = 1 + # mae_2 (sum over bs) = [[6/3, 1.5/3]] = [[2, .5]] = 2.5/2 = 1.25 + + # Epoch 2 - bias = 0.125 (2.5/2 * 0.1) + # y_pred_1 = [[[0.125], [0.125]], [[1.125], [1.125]], [[2.125], [2.125]]] + # y_pred_2 = [[[0.125], [0.125]], [[1.125], [1.125]], [[2.125], [2.125]]] + + # mae (y1 - y_pred_1) = [[[.375], [.875]], + # [[.875], [1.375]], + # [[1.375], [.375]]] + # with weights = [[[.375 * .5], [.875 * 2.]], + # [[.875 * .5], [1.375 * 2.]], + # [[1.375 * .5], [.375 * 2.]]] + # mae (w/o weights) = [[2.625/3, 2.625/3]] = (.875+.875)/2 = .875 + # mae (weighted mean) = [[1.3125/1.5, 5.25/6]] = (.875+.875)/2 = .875 + # mae (sum over bs) = [[1.3125/3, 5.25/3]] = (0.4375+1.75)/2 = 1.09375 + + # mae_2 (y2 - y_pred_2) = [[[.375], [1.375]], + # [[.875], [.375]], + # [[1.375], [.875]]] + # with weights = [[[.375 * 2.], [1.375 * .5]], + # [[.875 * 2.], [.375 * .5]], + # [[1.375 * 2.], [.875 * .5]]] + # mae_2 (w/o weights) = [[2.625/3, 2.625/3]] = (.875+.875)/2 = .875 + # mae_2 (weighted mean) = [[5.25/6, 1.3125/1.5]] = (.875+.875)/2 = .875 + # mae_2 (sum over bs) = [[5.25/3, 1.3125/3]] = (1.75+0.4375)/2 = 1.09375 + + self.expected_fit_result_with_weights = { + 'output_1_mae': [1, 0.875], + 'output_2_mae': [1, 0.875], + 'output_1_' + self.wmae: [1, 0.875], + 'output_2_' + self.wmae: [1, 0.875], + 'loss': [2.5, 2.1875], + 'output_1_loss': [1.25, 1.09375], + 'output_2_loss': [1.25, 1.09375], + } + + self.expected_fit_result_with_weights_output_2 = { + 'output_1_mae': [1., 0.9], + 'output_2_mae': [1, 0.875], + 'output_1_' + self.wmae: [1., 0.9], + 'output_2_' + self.wmae: [1., 0.875], + 'loss': [2.25, 1.99375], + 'output_1_loss': [1., 0.9], + 'output_2_loss': [1.25, 1.09375], + } + + # In the order: 'loss', 'output_1_loss', 'output_2_loss', + # 'output_1_mae', 'output_1_mae_2', + # 'output_2_mae', 'output_2_mae_2' + self.expected_batch_result_with_weights = [ + 2.1875, 1.09375, 1.09375, 0.875, 0.875, 0.875, 0.875 + ] + self.expected_batch_result_with_weights_output_2 = [ + 1.99375, 0.9, 1.09375, 0.9, 0.9, 0.875, 0.875 + ] + self.expected_batch_result = [1.8, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9] + + def test_fit(self): + + def _train_and_assert(model): + history = model.fit([self.x, self.x], [self.y1, self.y2], + batch_size=3, + epochs=2, + shuffle=False) + for key, value in self.expected_fit_result.items(): + self.assertAllClose(history.history[key], value, 1e-3) + + run_with_different_sample_weight_mode_inputs(_train_and_assert) + + def test_fit_with_sample_weight(self): + + def _train_and_assert(model): + history = model.fit([self.x, self.x], [self.y1, self.y2], + sample_weight={ + 'output_1': self.sample_weight_1, + 'output_2': self.sample_weight_2, + }, + batch_size=3, + epochs=2, + shuffle=False) + for key, value in self.expected_fit_result_with_weights.items(): + self.assertAllClose(history.history[key], value, 1e-3) + + run_with_different_sample_weight_mode_inputs( + _train_and_assert, partial_sw=False) + + def test_fit_with_partial_sample_weight(self): + + def _train_and_assert(model): + history = model.fit([self.x, self.x], [self.y1, self.y2], + sample_weight={ + 'output_2': self.sample_weight_2, + }, + batch_size=3, + epochs=2, + shuffle=False) + for key, value in self.expected_fit_result_with_weights_output_2.items(): + self.assertAllClose(history.history[key], value, 1e-3) + + run_with_different_sample_weight_mode_inputs(_train_and_assert) + + def test_eval(self): + + def 
_eval_and_assert(model): + model.train_on_batch([self.x, self.x], [self.y1, self.y2]) + eval_result = model.evaluate([self.x, self.x], [self.y1, self.y2], + batch_size=3) + self.assertAllClose(eval_result, self.expected_batch_result, 1e-3) + + run_with_different_sample_weight_mode_inputs(_eval_and_assert) + + def test_eval_with_sample_weight(self): + + def _eval_and_assert(model): + model.train_on_batch([self.x, self.x], [self.y1, self.y2], + sample_weight={ + 'output_1': self.sample_weight_1, + 'output_2': self.sample_weight_2, + }) + eval_result = model.evaluate([self.x, self.x], [self.y1, self.y2], + batch_size=3, + sample_weight={ + 'output_1': self.sample_weight_1, + 'output_2': self.sample_weight_2, + }) + self.assertAllClose(eval_result, self.expected_batch_result_with_weights, + 1e-3) + + run_with_different_sample_weight_mode_inputs( + _eval_and_assert, partial_sw=False) + + def test_eval_with_partial_sample_weight(self): + + def _eval_and_assert(model): + model.train_on_batch([self.x, self.x], [self.y1, self.y2], + sample_weight={ + 'output_2': self.sample_weight_2, + }) + eval_result = model.evaluate([self.x, self.x], [self.y1, self.y2], + batch_size=3, + sample_weight={ + 'output_2': self.sample_weight_2, + }) + self.assertAllClose(eval_result, + self.expected_batch_result_with_weights_output_2, + 1e-3) + + run_with_different_sample_weight_mode_inputs(_eval_and_assert) + + def test_train_on_batch(self): + + def _train_and_assert(model): + for _ in range(2): + result = model.train_on_batch([self.x, self.x], [self.y1, self.y2]) + self.assertAllClose(result, self.expected_batch_result, 1e-3) + + run_with_different_sample_weight_mode_inputs(_train_and_assert) + + def test_train_on_batch_with_sample_weight(self): + + def _train_and_assert(model): + for _ in range(2): + result = model.train_on_batch([self.x, self.x], [self.y1, self.y2], + sample_weight={ + 'output_1': self.sample_weight_1, + 'output_2': self.sample_weight_2, + }) + self.assertAllClose(result, self.expected_batch_result_with_weights, 1e-3) + + run_with_different_sample_weight_mode_inputs( + _train_and_assert, partial_sw=False) + + def test_train_on_batch_with_partial_sample_weight(self): + + def _train_and_assert(model): + for _ in range(2): + result = model.train_on_batch([self.x, self.x], [self.y1, self.y2], + sample_weight={ + 'output_2': self.sample_weight_2, + }) + self.assertAllClose(result, + self.expected_batch_result_with_weights_output_2, + 1e-3) + + run_with_different_sample_weight_mode_inputs(_train_and_assert) + + def test_test_on_batch(self): + + def _test_and_assert(model): + model.train_on_batch([self.x, self.x], [self.y1, self.y2]) + result = model.test_on_batch([self.x, self.x], [self.y1, self.y2]) + self.assertAllClose(result, self.expected_batch_result, 1e-3) + + run_with_different_sample_weight_mode_inputs(_test_and_assert) + + def test_test_on_batch_with_sample_weight(self): + + def _test_and_assert(model): + model.train_on_batch([self.x, self.x], [self.y1, self.y2], + sample_weight={ + 'output_1': self.sample_weight_1, + 'output_2': self.sample_weight_2, + }) + result = model.test_on_batch([self.x, self.x], [self.y1, self.y2], + sample_weight={ + 'output_1': self.sample_weight_1, + 'output_2': self.sample_weight_2, + }) + self.assertAllClose(result, self.expected_batch_result_with_weights, 1e-3) + + run_with_different_sample_weight_mode_inputs( + _test_and_assert, partial_sw=False) + + def test_test_on_batch_with_partial_sample_weight(self): + + def _test_and_assert(model): + 
model.train_on_batch([self.x, self.x], [self.y1, self.y2], + sample_weight={ + 'output_2': self.sample_weight_2, + }) + result = model.test_on_batch([self.x, self.x], [self.y1, self.y2], + sample_weight={ + 'output_2': self.sample_weight_2, + }) + self.assertAllClose(result, + self.expected_batch_result_with_weights_output_2, + 1e-3) + + run_with_different_sample_weight_mode_inputs(_test_and_assert) + + def test_fit_generator(self): + + def _train_and_assert(model): + history = model.fit_generator( + self.custom_generator_multi_io_temporal(), + steps_per_epoch=1, + epochs=2) + for key, value in self.expected_fit_result.items(): + self.assertAllClose(history.history[key], value, 1e-3) + + run_with_different_sample_weight_mode_inputs(_train_and_assert) + + def test_fit_generator_with_sample_weight(self): + + def _train_and_assert(model): + history = model.fit_generator( + self.custom_generator_multi_io_temporal( + sample_weights=[self.sample_weight_1, self.sample_weight_2]), + steps_per_epoch=1, + epochs=2) + for key, value in self.expected_fit_result_with_weights.items(): + self.assertAllClose(history.history[key], value, 1e-3) + + run_with_different_sample_weight_mode_inputs( + _train_and_assert, partial_sw=False) + + def test_fit_generator_with_partial_sample_weight(self): + + def _train_and_assert(model): + history = model.fit_generator( + self.custom_generator_multi_io_temporal( + sample_weights=[None, self.sample_weight_2]), + steps_per_epoch=1, + epochs=2) + for key, value in self.expected_fit_result_with_weights_output_2.items(): + self.assertAllClose(history.history[key], value, 1e-3) + + run_with_different_sample_weight_mode_inputs(_train_and_assert) + + def test_eval_generator(self): + + def _test_and_assert(model): + model.train_on_batch([self.x, self.x], [self.y1, self.y2]) + eval_result = model.evaluate_generator( + self.custom_generator_multi_io_temporal(), steps=1) + self.assertAllClose(eval_result, self.expected_batch_result, 1e-3) + + run_with_different_sample_weight_mode_inputs(_test_and_assert) + + def test_eval_generator_with_sample_weight(self): + + def _test_and_assert(model): + model.train_on_batch([self.x, self.x], [self.y1, self.y2], + sample_weight={ + 'output_1': self.sample_weight_1, + 'output_2': self.sample_weight_2, + }) + eval_result = model.evaluate_generator( + self.custom_generator_multi_io_temporal( + sample_weights=[self.sample_weight_1, self.sample_weight_2]), + steps=2) + self.assertAllClose(eval_result, self.expected_batch_result_with_weights, + 1e-3) + + run_with_different_sample_weight_mode_inputs( + _test_and_assert, partial_sw=False) + + def test_eval_generator_with_partial_sample_weight(self): + + def _test_and_assert(model): + model.train_on_batch([self.x, self.x], [self.y1, self.y2], + sample_weight={ + 'output_2': self.sample_weight_2, + }) + eval_result = model.evaluate_generator( + self.custom_generator_multi_io_temporal( + sample_weights=[None, self.sample_weight_2]), + steps=2) + self.assertAllClose(eval_result, + self.expected_batch_result_with_weights_output_2, + 1e-3) + + run_with_different_sample_weight_mode_inputs(_test_and_assert) + + def test_error_on_fit_with_class_weight(self): + + def _train_and_assert(model): + with self.assertRaisesRegex( + ValueError, + r'`class_weight` not supported for 3\+ dimensional targets.'): + model.fit([self.x, self.x], [self.y1, self.y2], + class_weight={'output_1': { + .5: .5, + 2.: .5, + 3.5: .5 + }}, + batch_size=3, + epochs=2, + shuffle=False) + + 
run_with_different_sample_weight_mode_inputs(_train_and_assert) + + +if __name__ == '__main__': + test.main() From 518f8b57d8007203bb1a148d3eb857866fcef16c Mon Sep 17 00:00:00 2001 From: Juhyun Lee Date: Mon, 22 Jul 2019 11:57:32 -0700 Subject: [PATCH 0321/3053] TFLite GPU: Sort parsers alphabetically. Also apply minor changes. - Add a blank line after each IsSupported() of derived classes of TFLiteOperationParser. - Add public: to a couple or three classes. - Rename Lstm to LSTM. - Rename PReLu to PReLU. - Rename ReLu to ReLU. - Rename SoftMax to Softmax. PiperOrigin-RevId: 259375388 --- .../delegates/gpu/common/model_builder.cc | 1417 +++++++++-------- 1 file changed, 716 insertions(+), 701 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.cc b/tensorflow/lite/delegates/gpu/common/model_builder.cc index a987c274a75..159eec57885 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder.cc +++ b/tensorflow/lite/delegates/gpu/common/model_builder.cc @@ -391,7 +391,7 @@ Status CheckInputsOutputs(const TfLiteContext* context, // A parser responsible for parsing TFLite operation and adding it to a graph. class TFLiteOperationParser { public: - virtual ~TFLiteOperationParser() {} + virtual ~TFLiteOperationParser() = default; // Parses TFLite operation. This method allows expanding fused operations // into more than one node. @@ -593,52 +593,6 @@ Status CheckKernelsAndStrides(int kernel_h, int kernel_w, int strides_h, return OkStatus(); } -class Conv2DOperationParser : public TFLiteOperationParser { - public: - Status IsSupported(const TfLiteContext* context, - const TfLiteNode* tflite_node, - const TfLiteRegistration* registration) final { - RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 1)); - RETURN_IF_ERROR( - CheckInputsOutputs(context, tflite_node, /*inputs=*/1, /*outputs=*/1)); - RETURN_IF_ERROR(CheckTensorIsAvailable(context, tflite_node, 1)); - TfLiteConvParams* tf_options = nullptr; - RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); - RETURN_IF_ERROR(CheckStridesAndDilation( - tf_options->stride_height, tf_options->stride_width, - tf_options->dilation_height_factor, tf_options->dilation_width_factor)); - return IsActivationSupported(tf_options->activation); - } - - Status Parse(const TfLiteNode* tflite_node, - const TfLiteRegistration* registration, GraphFloat32* graph, - ObjectReader* reader) final { - Node* node = graph->NewNode(); - node->operation.type = ToString(OperationType::CONVOLUTION_2D); - RETURN_IF_ERROR(reader->AddInput(node, 0)); - RETURN_IF_ERROR(reader->AddOutputs(node)); - - Convolution2DAttributes attr; - RETURN_IF_ERROR(reader->ReadTensor(1, &attr.weights)); - reader->ReadTensor(2, &attr.bias).IgnoreError(); // bias is optional - - const auto* tf_options = - reinterpret_cast(tflite_node->builtin_data); - if (!tf_options) { - return InternalError("Missing tflite params"); - } - attr.strides = ToHW(tf_options->stride_height, tf_options->stride_width); - attr.dilations = HW(tf_options->dilation_height_factor, - tf_options->dilation_width_factor); - UpdatePadding(tf_options->padding, - graph->FindInputs(node->id)[0]->tensor.shape, &attr); - RETURN_IF_ERROR(MaybeFuseActivationToTheSingleOutput(tf_options->activation, - graph, node)); - node->operation.attributes = std::move(attr); - return OkStatus(); - } -}; - // Creates a simple node that holds tensor value. 
Status NewConstNode(TensorFloat32 t, GraphFloat32* graph, Value>** value) { @@ -656,6 +610,115 @@ Status NewConstNode(TensorFloat32 t, GraphFloat32* graph, return OkStatus(); } +Status ParsePoolingAttributes(const TfLitePoolParams* tf_options, + const BHWC& input_shape, + Pooling2DAttributes* attr) { + attr->kernel = ToHW(tf_options->filter_height, tf_options->filter_width); + attr->strides = ToHW(tf_options->stride_height, tf_options->stride_width); + UpdatePadding(tf_options->padding, input_shape, attr); + return OkStatus(); +} + +Status ExtractTensorShape(const TfLiteTensor& tflite_tensor, BHWC* bhwc) { + const TfLiteIntArray* dims = tflite_tensor.dims; + switch (dims->size) { + case 1: + *bhwc = BHWC(dims->data[0], 1, 1, 1); + return OkStatus(); + case 2: + *bhwc = BHWC(dims->data[0], 1, 1, dims->data[1]); + return OkStatus(); + case 3: + *bhwc = BHWC(dims->data[0], 1, dims->data[1], dims->data[2]); + return OkStatus(); + case 4: + *bhwc = BHWC(dims->data[0], dims->data[1], dims->data[2], dims->data[3]); + return OkStatus(); + default: + return InvalidArgumentError(absl::StrCat( + "Tensor \"", tflite_tensor.name ? tflite_tensor.name : "nullptr", + "\" has bad input dims size: ", dims->size, ".")); + } +} + +class AddOperationParser : public TFLiteOperationParser { + public: + Status IsSupported(const TfLiteContext* context, + const TfLiteNode* tflite_node, + const TfLiteRegistration* registration) final { + RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 1)); + if (tflite_node->inputs->size != 2) { + return UnimplementedError("ADD requires two input tensors."); + } + // TODO(eignasheva): Add shapes check. + TfLiteAddParams* tf_options = nullptr; + return RetrieveBuiltinData(tflite_node, &tf_options); + } + + Status Parse(const TfLiteNode* tflite_node, + const TfLiteRegistration* registration, GraphFloat32* graph, + ObjectReader* reader) final { + // TFLite currently only supports 2 input ADDs. Thus, the logic below only + // considers 2 input cases. The underlying GPU shader programs can accept + // more inputs, but the logic below would have to be expanded. + + // Determine runtime/constant tensors. 
+ const TfLiteTensor* input0 = reader->GetInputTensor(0); + if (!input0) { + return InvalidArgumentError("Couldn't get the 1st input tensor for ADD."); + } + const TfLiteTensor* input1 = reader->GetInputTensor(1); + if (!input1) { + return InvalidArgumentError("Couldn't get the 2nd input tensor for ADD."); + } + const bool constant_tensor0 = IsConstantTensor(input0); + const bool constant_tensor1 = IsConstantTensor(input1); + if (constant_tensor0 && constant_tensor1) { + return InvalidArgumentError("No runtime input tensors for ADD."); + } + const bool runtime_tensor0 = !constant_tensor0; + const bool runtime_tensor1 = !constant_tensor1; + + Node* node = graph->NewNode(); + node->operation.type = ToString(OperationType::ADD); + RETURN_IF_ERROR(reader->AddOutputs(node)); + + AddAttributes attr; + if (runtime_tensor0 && runtime_tensor1) { + RETURN_IF_ERROR(reader->AddInput(node, 0)); + RETURN_IF_ERROR(reader->AddInput(node, 1)); + } else { + int runtime_tensor = 0; + int constant_tensor = 1; + TfLiteIntArray* constant_dims = input1->dims; + if (constant_tensor0 && runtime_tensor1) { + runtime_tensor = 1; + constant_tensor = 0; + constant_dims = input0->dims; + } + RETURN_IF_ERROR(reader->AddInput(node, runtime_tensor)); + if (constant_dims->size <= 0) { + Tensor tensor; + RETURN_IF_ERROR(reader->ReadTensor(constant_tensor, &tensor)); + attr.param = tensor.data[0]; + } else { + Tensor tensor; + RETURN_IF_ERROR(reader->ReadTensor(constant_tensor, &tensor)); + attr.param = std::move(tensor); + } + } + node->operation.attributes = std::move(attr); + + const auto* tf_options = + reinterpret_cast(tflite_node->builtin_data); + if (!tf_options) { + return InternalError("Missing tflite params"); + } + return MaybeFuseActivationToTheSingleOutput(tf_options->activation, graph, + node); + } +}; + class ConcatenationOperationParser : public TFLiteOperationParser { public: Status IsSupported(const TfLiteContext* context, @@ -777,6 +840,90 @@ class ConcatenationOperationParser : public TFLiteOperationParser { } }; +class Conv2DOperationParser : public TFLiteOperationParser { + public: + Status IsSupported(const TfLiteContext* context, + const TfLiteNode* tflite_node, + const TfLiteRegistration* registration) final { + RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 1)); + RETURN_IF_ERROR( + CheckInputsOutputs(context, tflite_node, /*inputs=*/1, /*outputs=*/1)); + RETURN_IF_ERROR(CheckTensorIsAvailable(context, tflite_node, 1)); + TfLiteConvParams* tf_options = nullptr; + RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); + RETURN_IF_ERROR(CheckStridesAndDilation( + tf_options->stride_height, tf_options->stride_width, + tf_options->dilation_height_factor, tf_options->dilation_width_factor)); + return IsActivationSupported(tf_options->activation); + } + + Status Parse(const TfLiteNode* tflite_node, + const TfLiteRegistration* registration, GraphFloat32* graph, + ObjectReader* reader) final { + Node* node = graph->NewNode(); + node->operation.type = ToString(OperationType::CONVOLUTION_2D); + RETURN_IF_ERROR(reader->AddInput(node, 0)); + RETURN_IF_ERROR(reader->AddOutputs(node)); + + Convolution2DAttributes attr; + RETURN_IF_ERROR(reader->ReadTensor(1, &attr.weights)); + reader->ReadTensor(2, &attr.bias).IgnoreError(); // bias is optional + + const auto* tf_options = + reinterpret_cast(tflite_node->builtin_data); + if (!tf_options) { + return InternalError("Missing tflite params"); + } + attr.strides = ToHW(tf_options->stride_height, tf_options->stride_width); + attr.dilations = 
HW(tf_options->dilation_height_factor, + tf_options->dilation_width_factor); + UpdatePadding(tf_options->padding, + graph->FindInputs(node->id)[0]->tensor.shape, &attr); + RETURN_IF_ERROR(MaybeFuseActivationToTheSingleOutput(tf_options->activation, + graph, node)); + node->operation.attributes = std::move(attr); + return OkStatus(); + } +}; + +class Convolution2DTransposeBiasParser : public TFLiteOperationParser { + public: + Status IsSupported(const TfLiteContext* context, + const TfLiteNode* tflite_node, + const TfLiteRegistration* registration) final { + RETURN_IF_ERROR(CheckTensorIsAvailable(context, tflite_node, 1)); + TfLiteTransposeConvParams* tf_options = nullptr; + RETURN_IF_ERROR(RetrieveCustomInitialData(tflite_node, &tf_options)); + RETURN_IF_ERROR( + CheckStrides(tf_options->stride_height, tf_options->stride_width)); + return OkStatus(); + } + + Status Parse(const TfLiteNode* tflite_node, + const TfLiteRegistration* registration, GraphFloat32* graph, + ObjectReader* reader) final { + auto* node = graph->NewNode(); + node->operation.type = ToString(OperationType::CONVOLUTION_TRANSPOSED); + RETURN_IF_ERROR(reader->AddInput(node, 0)); + RETURN_IF_ERROR(reader->AddOutputs(node)); + + const auto* params = reinterpret_cast( + tflite_node->custom_initial_data); + ConvolutionTransposedAttributes attr; + attr.stride = + params ? HW(params->stride_height, params->stride_width) : HW(1, 1); + + RETURN_IF_ERROR(reader->ReadTensor(1, &attr.weights)); + reader->ReadTensor(2, &attr.bias).IgnoreError(); // bias is optional + + UpdatePadding(params->padding, graph->FindInputs(node->id)[0]->tensor.shape, + &attr); + + node->operation.attributes = std::move(attr); + return OkStatus(); + } +}; + class DepthwiseConvolutionOperationParser : public TFLiteOperationParser { public: Status IsSupported(const TfLiteContext* context, @@ -891,492 +1038,6 @@ class DepthwiseConvolutionOperationParser : public TFLiteOperationParser { } }; -class HardSwishOperationParser : public TFLiteOperationParser { - public: - Status IsSupported(const TfLiteContext* context, - const TfLiteNode* tflite_node, - const TfLiteRegistration*) final { - return CheckInputsOutputs(context, tflite_node, /*inputs=*/1, - /*outputs=*/1); - } - - Status Parse(const TfLiteNode*, const TfLiteRegistration*, - GraphFloat32* graph, ObjectReader* reader) final { - Node* node = graph->NewNode(); - node->operation.type = ToString(OperationType::HARD_SWISH); - RETURN_IF_ERROR(reader->AddInput(node, 0)); - return reader->AddOutputs(node); - } -}; - -class ReshapeOperationParser : public TFLiteOperationParser { - public: - Status IsSupported(const TfLiteContext* context, - const TfLiteNode* tflite_node, - const TfLiteRegistration* registration) final { - RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 1)); - RETURN_IF_ERROR( - CheckInputsOutputs(context, tflite_node, /*inputs=*/1, /*outputs=*/1)); - // TODO(eignasheva): add shape checking - return OkStatus(); - } - - Status Parse(const TfLiteNode* tflite_node, - const TfLiteRegistration* registration, GraphFloat32* graph, - ObjectReader* reader) final { - Node* node = graph->NewNode(); - node->operation.type = ToString(OperationType::RESHAPE); - RETURN_IF_ERROR(reader->AddInput(node, 0)); - RETURN_IF_ERROR(reader->AddOutputs(node)); - // Here we may have extra inputs. Other tensors were supposed to - // define new shape, but in TFLite these are ignored. - // TODO(akulik): check that shapes match? - - // New shape comes from output shape. 
- ReshapeAttributes attr; - attr.new_shape = graph->FindOutputs(node->id)[0]->tensor.shape; - node->operation.attributes = attr; - return OkStatus(); - } -}; - -Status ParsePoolingAttributes(const TfLitePoolParams* tf_options, - const BHWC& input_shape, - Pooling2DAttributes* attr) { - attr->kernel = ToHW(tf_options->filter_height, tf_options->filter_width); - attr->strides = ToHW(tf_options->stride_height, tf_options->stride_width); - UpdatePadding(tf_options->padding, input_shape, attr); - return OkStatus(); -} - -class Pooling2DOperationParser : public TFLiteOperationParser { - Status IsSupported(const TfLiteContext* context, - const TfLiteNode* tflite_node, - const TfLiteRegistration* registration) final { - RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 1)); - TfLitePoolParams* tf_options = nullptr; - auto status = RetrieveCustomInitialData(tflite_node, &tf_options); - if (status.ok()) { // custom case with indices as a second output - RETURN_IF_ERROR(CheckInputsOutputs(context, tflite_node, /*inputs=*/1, - /*outputs=*/2)); - } else { // common pooling with 1 output - RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); - RETURN_IF_ERROR(CheckInputsOutputs(context, tflite_node, /*inputs=*/1, - /*outputs=*/1)); - } - RETURN_IF_ERROR(CheckKernelsAndStrides( - tf_options->filter_height, tf_options->filter_width, - tf_options->stride_height, tf_options->stride_width)); - return IsActivationSupported(tf_options->activation); - } - - public: - explicit Pooling2DOperationParser(PoolingType type) : type_(type) {} - - Status Parse(const TfLiteNode* tflite_node, - const TfLiteRegistration* registration, GraphFloat32* graph, - ObjectReader* reader) final { - Node* node = graph->NewNode(); - node->operation.type = ToString(OperationType::POOLING_2D); - RETURN_IF_ERROR(reader->AddInput(node, 0)); - RETURN_IF_ERROR(reader->AddOutput(node, 0)); - - Pooling2DAttributes attr; - attr.type = type_; - - auto input_shape = graph->FindInputs(node->id)[0]->tensor.shape; - - // check whether there are custom options encoded. It happens if operation - // is MaxPoolingWithArgmax2D. There is no way to read - // tflite_node->builtin_code, so, simply check whether custom data is - // available. - auto* tf_options = reinterpret_cast( - tflite_node->custom_initial_data); - if (!tf_options) { - tf_options = - reinterpret_cast(tflite_node->builtin_data); - } - if (!tf_options) { - return InternalError("Missing tflite params"); - } - - std::vector max_tensor_id{0}; - RETURN_IF_ERROR(MaybeFuseActivation(tf_options->activation, max_tensor_id, - graph, node)); - // Second output is optional. It is not required, it but must be added after - // MaybeAddFusedActivation function is called - reader->AddOutput(node, 1).IgnoreError(); - - // First output is the result of pooling operation, while second output is - // indices used for pooling. - auto outputs = graph->FindOutputs(node->id); - attr.output_indices = outputs.size() == 2; - if (attr.output_indices) { - // Fix data type for output indices. In the model it is set as float32. 
- outputs[1]->tensor.type = DataType::INT32; - } - RETURN_IF_ERROR(ParsePoolingAttributes(tf_options, input_shape, &attr)); - node->operation.attributes = attr; - return OkStatus(); - } - - private: - const PoolingType type_; -}; - -class Unpooling2DOperationParser : public TFLiteOperationParser { - public: - Status IsSupported(const TfLiteContext* context, - const TfLiteNode* tflite_node, - const TfLiteRegistration* registration) final { - TfLitePoolParams* tf_options = nullptr; - RETURN_IF_ERROR( - CheckInputsOutputs(context, tflite_node, /*inputs=*/2, /*outputs=*/1)); - RETURN_IF_ERROR(RetrieveCustomInitialData(tflite_node, &tf_options)); - RETURN_IF_ERROR(CheckKernelsAndStrides( - tf_options->filter_height, tf_options->filter_width, - tf_options->stride_height, tf_options->stride_width)); - return OkStatus(); - } - - Status Parse(const TfLiteNode* tflite_node, - const TfLiteRegistration* registration, GraphFloat32* graph, - ObjectReader* reader) final { - Node* node = graph->NewNode(); - node->operation.type = ToString(OperationType::MAX_UNPOOLING_2D); - RETURN_IF_ERROR(reader->AddInput(node, 0)); - RETURN_IF_ERROR(reader->AddInput(node, 1)); - RETURN_IF_ERROR(reader->AddOutputs(node)); - auto input_shape = graph->FindInputs(node->id)[0]->tensor.shape; - MaxUnpooling2DAttributes attr; - const auto* tf_options = reinterpret_cast( - tflite_node->custom_initial_data); - if (!tf_options) { - return InternalError("Missing tflite params"); - } - attr.kernel = ToHW(tf_options->filter_height, tf_options->filter_width); - attr.strides = ToHW(tf_options->stride_height, tf_options->stride_width); - UpdatePadding(tf_options->padding, input_shape, &attr); - - node->operation.attributes = attr; - - auto output_value = graph->FindOutputs(node->id)[0]; - output_value->tensor.shape = CalculateOutputShape(input_shape, attr); - return OkStatus(); - } -}; - -class SoftMaxOperationParser : public TFLiteOperationParser { - public: - Status IsSupported(const TfLiteContext* context, - const TfLiteNode* tflite_node, - const TfLiteRegistration* registration) final { - RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 1)); - RETURN_IF_ERROR( - CheckInputsOutputs(context, tflite_node, /*inputs=*/1, /*outputs=*/1)); - TfLiteSoftmaxParams* tf_options = nullptr; - RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); - if (tf_options->beta != 1) { - // TODO(eignasheva): figure out, what's wrong with softmax. - return UnimplementedError("Softmax.beta != 1 is not supported."); - } - return OkStatus(); - } - Status Parse(const TfLiteNode* tflite_node, - const TfLiteRegistration* registration, GraphFloat32* graph, - ObjectReader* reader) final { - Node* node = graph->NewNode(); - node->operation.type = ToString(OperationType::SOFT_MAX); - RETURN_IF_ERROR(reader->AddInput(node, 0)); - RETURN_IF_ERROR(reader->AddOutputs(node)); - - const auto* tf_options = - reinterpret_cast(tflite_node->builtin_data); - if (!tf_options) { - return InternalError("Missing tflite params"); - } - if (tf_options->beta != 1) { - // there is multiply by scalar operation fused in SoftMax. Make a layer - // out of it before SoftMax. 
- return UnimplementedError("Softmax.beta != 1 is not supported."); - // auto mul_node = reader->NewPassthroughNode(node); - // mul_node->operation.type = ToString(OperationType::MUL); - } - SoftMaxAttributes attr; - attr.axis = Axis::CHANNELS; // always by channels - node->operation.attributes = attr; - return OkStatus(); - } -}; - -class AddOperationParser : public TFLiteOperationParser { - public: - Status IsSupported(const TfLiteContext* context, - const TfLiteNode* tflite_node, - const TfLiteRegistration* registration) final { - RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 1)); - if (tflite_node->inputs->size != 2) { - return UnimplementedError("ADD requires two input tensors."); - } - // TODO(eignasheva): Add shapes check. - TfLiteAddParams* tf_options = nullptr; - return RetrieveBuiltinData(tflite_node, &tf_options); - } - - Status Parse(const TfLiteNode* tflite_node, - const TfLiteRegistration* registration, GraphFloat32* graph, - ObjectReader* reader) final { - // TFLite currently only supports 2 input ADDs. Thus, the logic below only - // considers 2 input cases. The underlying GPU shader programs can accept - // more inputs, but the logic below would have to be expanded. - - // Determine runtime/constant tensors. - const TfLiteTensor* input0 = reader->GetInputTensor(0); - if (!input0) { - return InvalidArgumentError("Couldn't get the 1st input tensor for ADD."); - } - const TfLiteTensor* input1 = reader->GetInputTensor(1); - if (!input1) { - return InvalidArgumentError("Couldn't get the 2nd input tensor for ADD."); - } - const bool constant_tensor0 = IsConstantTensor(input0); - const bool constant_tensor1 = IsConstantTensor(input1); - if (constant_tensor0 && constant_tensor1) { - return InvalidArgumentError("No runtime input tensors for ADD."); - } - const bool runtime_tensor0 = !constant_tensor0; - const bool runtime_tensor1 = !constant_tensor1; - - Node* node = graph->NewNode(); - node->operation.type = ToString(OperationType::ADD); - RETURN_IF_ERROR(reader->AddOutputs(node)); - - AddAttributes attr; - if (runtime_tensor0 && runtime_tensor1) { - RETURN_IF_ERROR(reader->AddInput(node, 0)); - RETURN_IF_ERROR(reader->AddInput(node, 1)); - } else { - int runtime_tensor = 0; - int constant_tensor = 1; - TfLiteIntArray* constant_dims = input1->dims; - if (constant_tensor0 && runtime_tensor1) { - runtime_tensor = 1; - constant_tensor = 0; - constant_dims = input0->dims; - } - RETURN_IF_ERROR(reader->AddInput(node, runtime_tensor)); - if (constant_dims->size <= 0) { - Tensor tensor; - RETURN_IF_ERROR(reader->ReadTensor(constant_tensor, &tensor)); - attr.param = tensor.data[0]; - } else { - Tensor tensor; - RETURN_IF_ERROR(reader->ReadTensor(constant_tensor, &tensor)); - attr.param = std::move(tensor); - } - } - node->operation.attributes = std::move(attr); - - const auto* tf_options = - reinterpret_cast(tflite_node->builtin_data); - if (!tf_options) { - return InternalError("Missing tflite params"); - } - return MaybeFuseActivationToTheSingleOutput(tf_options->activation, graph, - node); - } -}; - -// Basic LSTM Cell: -// -// 1name = name is at input index 1 -// name1 = name is at output index 1 -// -// 0input 1prev_activ -// \ / -// [[concat]] -// \ -// concat_temp2 2weights 3biases -// \ / / -// [[fully-connected]] -// \ -// activ_temp3 4prev_state -// \ / -// [[LSTM]] -// / \ -// new_state1 activation0 -// -class LstmOperationParser : public TFLiteOperationParser { - public: - Status IsSupported(const TfLiteContext* context, - const TfLiteNode* tflite_node, - const 
TfLiteRegistration* registration) final { - RETURN_IF_ERROR(CheckExactSupportedOpVersion(registration, 2)); - // TODO(eignasheva): Fix bad check. - // RETURN_IF_ERROR(CheckInputsOutputs(context, tflite_node, /*inputs=*/5, - // /*outputs=*/4)); - TfLiteLSTMParams* tf_options = nullptr; - RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); - RETURN_IF_ERROR(CheckParameters(tf_options)); - return OkStatus(); - } - - Status Parse(const TfLiteNode* tflite_node, - const TfLiteRegistration* registration, GraphFloat32* graph, - ObjectReader* reader) final { - if (tflite_node->inputs->size != 5) { - return InvalidArgumentError("LSTM should have 5 input tensors"); - } - if (tflite_node->outputs->size != 4) { - return InvalidArgumentError("LSTM should have 4 output tensors"); - } - - const auto* params = - reinterpret_cast(tflite_node->builtin_data); - if (!params) { - return InternalError("Missing tflite params"); - } - RETURN_IF_ERROR(CheckParameters(params)); - - Node* concat_node = graph->NewNode(); - concat_node->operation.type = ToString(OperationType::CONCAT); - ConcatAttributes concat_attr; - concat_attr.axis = Axis::CHANNELS; - concat_node->operation.attributes = concat_attr; - - Node* fc_node = graph->NewNode(); - fc_node->operation.type = ToString(OperationType::FULLY_CONNECTED); - FullyConnectedAttributes fc_attr; - RETURN_IF_ERROR(GetFullyConnectedAttributes(2, 3, reader, &fc_attr)); - fc_node->operation.attributes = std::move(fc_attr); - - Node* lstm_node = graph->NewNode(); - lstm_node->operation.type = ToString(OperationType::LSTM); - LstmAttributes lstm_attr; - lstm_attr.kernel_type = LstmKernelType::BASIC; - lstm_node->operation.attributes = lstm_attr; - - Value>* concat_temp; - int concat_tensor_idx = tflite_node->outputs->data[2]; - RETURN_IF_ERROR( - reader->ReadValueByTensorIdx(concat_tensor_idx, &concat_temp)); - Value>* activ_temp; - int activ_tensor_idx = tflite_node->outputs->data[3]; - RETURN_IF_ERROR( - reader->ReadValueByTensorIdx(activ_tensor_idx, &activ_temp)); - - RETURN_IF_ERROR(reader->AddInput(concat_node, 0)); // input - RETURN_IF_ERROR(reader->AddInput(concat_node, 1)); // prev_activ - RETURN_IF_ERROR(graph->SetProducer(concat_node->id, concat_temp->id)); - - RETURN_IF_ERROR(graph->AddConsumer(fc_node->id, concat_temp->id)); - RETURN_IF_ERROR(graph->SetProducer(fc_node->id, activ_temp->id)); - - RETURN_IF_ERROR(graph->AddConsumer(lstm_node->id, activ_temp->id)); - RETURN_IF_ERROR(reader->AddInput(lstm_node, 4)); // prev_state - RETURN_IF_ERROR(reader->AddOutput(lstm_node, 1)); // new_state - RETURN_IF_ERROR(reader->AddOutput(lstm_node, 0)); // activation - - return OkStatus(); - } - - private: - Status CheckParameters(const TfLiteLSTMParams* tf_options) { - if (tf_options->kernel_type != - TfLiteLSTMKernelType::kTfLiteLSTMBasicKernel) { - return UnimplementedError("Only kTfLiteLSTMBasicKernel is supported."); - } - if (tf_options->activation != kTfLiteActTanh) { - return UnimplementedError("Only TANH activation is supported."); - } - if (tf_options->cell_clip != 0.0f) { - return UnimplementedError("cell_clip is not supported."); - } - if (tf_options->proj_clip != 0.0f) { - return UnimplementedError("proj_clip is not supported."); - } - return OkStatus(); - } -}; - -class ResizeBilinearOperationParser : public TFLiteOperationParser { - public: - Status IsSupported(const TfLiteContext* context, - const TfLiteNode* tflite_node, - const TfLiteRegistration* registration) final { - RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 1)); - 
RETURN_IF_ERROR( - CheckInputsOutputs(context, tflite_node, /*inputs=*/1, /*outputs=*/1)); - - // TODO(eignasheva): check shapes. - TfLiteResizeBilinearParams* tf_options = nullptr; - RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); - return OkStatus(); - } - Status Parse(const TfLiteNode* tflite_node, - const TfLiteRegistration* registration, GraphFloat32* graph, - ObjectReader* reader) final { - Node* node = graph->NewNode(); - node->operation.type = ToString(OperationType::UPSAMPLE_2D); - RETURN_IF_ERROR(reader->AddInput(node, 0)); - RETURN_IF_ERROR(reader->AddOutputs(node)); - // Here we may have extra inputs. Other tensors were supposed to - // define new shape, but in TFLite these are ignored. - - const auto* tf_options = - reinterpret_cast( - tflite_node->builtin_data); - if (!tf_options) { - return InternalError("Missing tflite params"); - } - Upsample2DAttributes attr; - attr.align_corners = tf_options->align_corners; - attr.type = UpsamplingType::BILINEAR; - attr.new_shape.CopyAllDefinedAxis( - graph->FindOutputs(node->id)[0]->tensor.shape); - node->operation.attributes = attr; - return OkStatus(); - } -}; - -class PadOperationParser : public TFLiteOperationParser { - public: - Status IsSupported(const TfLiteContext* context, - const TfLiteNode* tflite_node, - const TfLiteRegistration* registration) final { - RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 1)); - RETURN_IF_ERROR( - CheckInputsOutputs(context, tflite_node, /*inputs=*/1, /*outputs=*/1)); - RETURN_IF_ERROR(CheckTensorIsAvailable(context, tflite_node, 1)); - return OkStatus(); - } - Status Parse(const TfLiteNode* tflite_node, - const TfLiteRegistration* registration, GraphFloat32* graph, - ObjectReader* reader) final { - Node* node = graph->NewNode(); - node->operation.type = ToString(OperationType::PAD); - RETURN_IF_ERROR(reader->AddInput(node, 0)); - RETURN_IF_ERROR(reader->AddOutputs(node)); - - PadAttributes attr; - attr.type = PaddingContentType::ZEROS; - Tensor paddings; - RETURN_IF_ERROR(reader->ReadTensor(1, &paddings)); - - // 4x2 tensor with paddings. 
- if (paddings.shape.h != 4 || paddings.shape.w != 2) { - return InvalidArgumentError("Paddings tensor has unexpected shape."); - } - if (paddings.data[0] != 0 || paddings.data[1] != 0) { - return UnimplementedError("Padding for BATCH channel is not supported."); - } - attr.prepended = HWC(paddings.data[2], paddings.data[4], paddings.data[6]); - attr.appended = HWC(paddings.data[3], paddings.data[5], paddings.data[7]); - node->operation.attributes = attr; - return OkStatus(); - } -}; - class ElementwiseOperationParser : public TFLiteOperationParser { public: explicit ElementwiseOperationParser(OperationType operation_type) @@ -1482,97 +1143,202 @@ class ElementwiseOperationParser : public TFLiteOperationParser { OperationType operation_type_; }; -class PReLuOperationParser : public TFLiteOperationParser { +class FullyConnectedOperationParser : public TFLiteOperationParser { public: Status IsSupported(const TfLiteContext* context, const TfLiteNode* tflite_node, const TfLiteRegistration* registration) final { RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 1)); - // TODO(eignasheva): add params check + TfLiteFullyConnectedParams* tf_options = nullptr; + RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); + if (tf_options->weights_format != + kTfLiteFullyConnectedWeightsFormatDefault) { + return UnimplementedError("Unsupported FullyConnected weights format."); + } + // TODO(eignasheva): check input shape return OkStatus(); } + Status Parse(const TfLiteNode* tflite_node, const TfLiteRegistration* registration, GraphFloat32* graph, ObjectReader* reader) final { Node* node = graph->NewNode(); - node->operation.type = ToString(OperationType::PRELU); RETURN_IF_ERROR(reader->AddInput(node, 0)); - auto input_shape = graph->FindInputs(node->id)[0]->tensor.shape; - PReLUAttributes attr; - Tensor linear_alpha; - Status status = reader->ReadTensor(1, &linear_alpha); - if (status.ok()) { - if (linear_alpha.shape.v != input_shape.c) { - return InvalidArgumentError( - "Linear alpha shape does not match the number of input channels."); - } - attr.alpha = std::move(linear_alpha); - } else { - Tensor hwc_alpha; - RETURN_IF_ERROR(reader->ReadTensor(1, &hwc_alpha)); - if (hwc_alpha.shape.h != input_shape.h || - hwc_alpha.shape.w != input_shape.w || - hwc_alpha.shape.c != input_shape.c) { - return InvalidArgumentError("Alpha shape does not match input shape."); - } - attr.alpha = std::move(hwc_alpha); + const auto* tf_options = + reinterpret_cast( + tflite_node->builtin_data); + if (tf_options->weights_format != + kTfLiteFullyConnectedWeightsFormatDefault) { + return UnimplementedError("Unsupported FullyConnected weights format."); } - node->operation.attributes = std::move(attr); + + FullyConnectedAttributes attr; + RETURN_IF_ERROR(GetFullyConnectedAttributes(1, 2, reader, &attr)); + + Tensor weights; + RETURN_IF_ERROR(reader->ReadTensor(1, &weights)); + auto input = graph->FindInputs(node->id)[0]; + int batch_size = input->tensor.shape.b; + if (input->tensor.shape.DimensionsProduct() / batch_size != + weights.shape.w) { + return UnimplementedError( + "Amount of input data should match weights width"); + } + + Node* conv = node; + if (input->tensor.shape.h != 1 || input->tensor.shape.w != 1) { + auto& reshape = node; + conv = graph->NewNode(); // reset conv pointer! 
+ Value>* reshaped_value = graph->NewValue(); + reshaped_value->tensor.shape = BHWC(1, 1, 1, weights.shape.w); + RETURN_IF_ERROR(graph->SetProducer(reshape->id, reshaped_value->id)); + reshape->operation.type = ToString(OperationType::RESHAPE); + ReshapeAttributes attr; + attr.new_shape = reshaped_value->tensor.shape; + reshape->operation.attributes = attr; + RETURN_IF_ERROR(graph->AddConsumer(conv->id, reshaped_value->id)); + } + + conv->operation.type = ToString(OperationType::FULLY_CONNECTED); + conv->operation.attributes = std::move(attr); + Status result = reader->AddOutputs(conv); + RETURN_IF_ERROR(MaybeFuseActivationToTheSingleOutput(tf_options->activation, + graph, conv)); + + return result; + } +}; + +class HardSwishOperationParser : public TFLiteOperationParser { + public: + Status IsSupported(const TfLiteContext* context, + const TfLiteNode* tflite_node, + const TfLiteRegistration*) final { + return CheckInputsOutputs(context, tflite_node, /*inputs=*/1, + /*outputs=*/1); + } + + Status Parse(const TfLiteNode*, const TfLiteRegistration*, + GraphFloat32* graph, ObjectReader* reader) final { + Node* node = graph->NewNode(); + node->operation.type = ToString(OperationType::HARD_SWISH); + RETURN_IF_ERROR(reader->AddInput(node, 0)); return reader->AddOutputs(node); } }; -class ReLuOperationParser : public TFLiteOperationParser { +// Basic LSTM Cell: +// +// 1name = name is at input index 1 +// name1 = name is at output index 1 +// +// 0input 1prev_activ +// \ / +// [[concat]] +// \ +// concat_temp2 2weights 3biases +// \ / / +// [[fully-connected]] +// \ +// activ_temp3 4prev_state +// \ / +// [[LSTM]] +// / \ +// new_state1 activation0 +// +class LSTMOperationParser : public TFLiteOperationParser { public: Status IsSupported(const TfLiteContext* context, const TfLiteNode* tflite_node, const TfLiteRegistration* registration) final { - RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 1)); + RETURN_IF_ERROR(CheckExactSupportedOpVersion(registration, 2)); + // TODO(eignasheva): Fix bad check. + // RETURN_IF_ERROR(CheckInputsOutputs(context, tflite_node, /*inputs=*/5, + // /*outputs=*/4)); + TfLiteLSTMParams* tf_options = nullptr; + RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); + RETURN_IF_ERROR(CheckParameters(tf_options)); return OkStatus(); } - explicit ReLuOperationParser(int clip) : clip_(clip) {} + Status Parse(const TfLiteNode* tflite_node, const TfLiteRegistration* registration, GraphFloat32* graph, ObjectReader* reader) final { - Node* node = graph->NewNode(); - node->operation.type = ToString(OperationType::RELU); - RETURN_IF_ERROR(reader->AddInput(node, 0)); + if (tflite_node->inputs->size != 5) { + return InvalidArgumentError("LSTM should have 5 input tensors"); + } + if (tflite_node->outputs->size != 4) { + return InvalidArgumentError("LSTM should have 4 output tensors"); + } - ReLUAttributes attr; - TfLiteLeakyReluParams* tf_options = nullptr; - RetrieveBuiltinData(tflite_node, &tf_options).IgnoreError(); - attr.alpha = tf_options ? 
tf_options->alpha : 0; - attr.clip = clip_; - node->operation.attributes = attr; - return reader->AddOutputs(node); + const auto* params = + reinterpret_cast(tflite_node->builtin_data); + if (!params) { + return InternalError("Missing tflite params"); + } + RETURN_IF_ERROR(CheckParameters(params)); + + Node* concat_node = graph->NewNode(); + concat_node->operation.type = ToString(OperationType::CONCAT); + ConcatAttributes concat_attr; + concat_attr.axis = Axis::CHANNELS; + concat_node->operation.attributes = concat_attr; + + Node* fc_node = graph->NewNode(); + fc_node->operation.type = ToString(OperationType::FULLY_CONNECTED); + FullyConnectedAttributes fc_attr; + RETURN_IF_ERROR(GetFullyConnectedAttributes(2, 3, reader, &fc_attr)); + fc_node->operation.attributes = std::move(fc_attr); + + Node* lstm_node = graph->NewNode(); + lstm_node->operation.type = ToString(OperationType::LSTM); + LstmAttributes lstm_attr; + lstm_attr.kernel_type = LstmKernelType::BASIC; + lstm_node->operation.attributes = lstm_attr; + + Value>* concat_temp; + int concat_tensor_idx = tflite_node->outputs->data[2]; + RETURN_IF_ERROR( + reader->ReadValueByTensorIdx(concat_tensor_idx, &concat_temp)); + Value>* activ_temp; + int activ_tensor_idx = tflite_node->outputs->data[3]; + RETURN_IF_ERROR( + reader->ReadValueByTensorIdx(activ_tensor_idx, &activ_temp)); + + RETURN_IF_ERROR(reader->AddInput(concat_node, 0)); // input + RETURN_IF_ERROR(reader->AddInput(concat_node, 1)); // prev_activ + RETURN_IF_ERROR(graph->SetProducer(concat_node->id, concat_temp->id)); + + RETURN_IF_ERROR(graph->AddConsumer(fc_node->id, concat_temp->id)); + RETURN_IF_ERROR(graph->SetProducer(fc_node->id, activ_temp->id)); + + RETURN_IF_ERROR(graph->AddConsumer(lstm_node->id, activ_temp->id)); + RETURN_IF_ERROR(reader->AddInput(lstm_node, 4)); // prev_state + RETURN_IF_ERROR(reader->AddOutput(lstm_node, 1)); // new_state + RETURN_IF_ERROR(reader->AddOutput(lstm_node, 0)); // activation + + return OkStatus(); } private: - int clip_; -}; - -Status ExtractTensorShape(const TfLiteTensor& tflite_tensor, BHWC* bhwc) { - const TfLiteIntArray* dims = tflite_tensor.dims; - switch (dims->size) { - case 1: - *bhwc = BHWC(dims->data[0], 1, 1, 1); - return OkStatus(); - case 2: - *bhwc = BHWC(dims->data[0], 1, 1, dims->data[1]); - return OkStatus(); - case 3: - *bhwc = BHWC(dims->data[0], 1, dims->data[1], dims->data[2]); - return OkStatus(); - case 4: - *bhwc = BHWC(dims->data[0], dims->data[1], dims->data[2], dims->data[3]); - return OkStatus(); - default: - return InvalidArgumentError(absl::StrCat( - "Tensor \"", tflite_tensor.name ? 
tflite_tensor.name : "nullptr", - "\" has bad input dims size: ", dims->size, ".")); + Status CheckParameters(const TfLiteLSTMParams* tf_options) { + if (tf_options->kernel_type != + TfLiteLSTMKernelType::kTfLiteLSTMBasicKernel) { + return UnimplementedError("Only kTfLiteLSTMBasicKernel is supported."); + } + if (tf_options->activation != kTfLiteActTanh) { + return UnimplementedError("Only TANH activation is supported."); + } + if (tf_options->cell_clip != 0.0f) { + return UnimplementedError("cell_clip is not supported."); + } + if (tf_options->proj_clip != 0.0f) { + return UnimplementedError("proj_clip is not supported."); + } + return OkStatus(); } -} +}; class MulOperationParser : public TFLiteOperationParser { public: @@ -1669,69 +1435,307 @@ class MulOperationParser : public TFLiteOperationParser { } }; -class FullyConnectedOperationParser : public TFLiteOperationParser { +class PReLUOperationParser : public TFLiteOperationParser { public: Status IsSupported(const TfLiteContext* context, const TfLiteNode* tflite_node, const TfLiteRegistration* registration) final { RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 1)); - TfLiteFullyConnectedParams* tf_options = nullptr; - RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); - if (tf_options->weights_format != - kTfLiteFullyConnectedWeightsFormatDefault) { - return UnimplementedError("Unsupported FullyConnected weights format."); - } - // TODO(eignasheva): check input shape + // TODO(eignasheva): add params check return OkStatus(); } Status Parse(const TfLiteNode* tflite_node, const TfLiteRegistration* registration, GraphFloat32* graph, ObjectReader* reader) final { Node* node = graph->NewNode(); + node->operation.type = ToString(OperationType::PRELU); + RETURN_IF_ERROR(reader->AddInput(node, 0)); + auto input_shape = graph->FindInputs(node->id)[0]->tensor.shape; + + PReLUAttributes attr; + Tensor linear_alpha; + Status status = reader->ReadTensor(1, &linear_alpha); + if (status.ok()) { + if (linear_alpha.shape.v != input_shape.c) { + return InvalidArgumentError( + "Linear alpha shape does not match the number of input channels."); + } + attr.alpha = std::move(linear_alpha); + } else { + Tensor hwc_alpha; + RETURN_IF_ERROR(reader->ReadTensor(1, &hwc_alpha)); + if (hwc_alpha.shape.h != input_shape.h || + hwc_alpha.shape.w != input_shape.w || + hwc_alpha.shape.c != input_shape.c) { + return InvalidArgumentError("Alpha shape does not match input shape."); + } + attr.alpha = std::move(hwc_alpha); + } + node->operation.attributes = std::move(attr); + return reader->AddOutputs(node); + } +}; + +class PadOperationParser : public TFLiteOperationParser { + public: + Status IsSupported(const TfLiteContext* context, + const TfLiteNode* tflite_node, + const TfLiteRegistration* registration) final { + RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 1)); + RETURN_IF_ERROR( + CheckInputsOutputs(context, tflite_node, /*inputs=*/1, /*outputs=*/1)); + RETURN_IF_ERROR(CheckTensorIsAvailable(context, tflite_node, 1)); + return OkStatus(); + } + + Status Parse(const TfLiteNode* tflite_node, + const TfLiteRegistration* registration, GraphFloat32* graph, + ObjectReader* reader) final { + Node* node = graph->NewNode(); + node->operation.type = ToString(OperationType::PAD); + RETURN_IF_ERROR(reader->AddInput(node, 0)); + RETURN_IF_ERROR(reader->AddOutputs(node)); + + PadAttributes attr; + attr.type = PaddingContentType::ZEROS; + Tensor paddings; + RETURN_IF_ERROR(reader->ReadTensor(1, &paddings)); + + // 4x2 tensor with paddings. 
+ if (paddings.shape.h != 4 || paddings.shape.w != 2) { + return InvalidArgumentError("Paddings tensor has unexpected shape."); + } + if (paddings.data[0] != 0 || paddings.data[1] != 0) { + return UnimplementedError("Padding for BATCH channel is not supported."); + } + attr.prepended = HWC(paddings.data[2], paddings.data[4], paddings.data[6]); + attr.appended = HWC(paddings.data[3], paddings.data[5], paddings.data[7]); + node->operation.attributes = attr; + return OkStatus(); + } +}; + +class Pooling2DOperationParser : public TFLiteOperationParser { + public: + Status IsSupported(const TfLiteContext* context, + const TfLiteNode* tflite_node, + const TfLiteRegistration* registration) final { + RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 1)); + TfLitePoolParams* tf_options = nullptr; + auto status = RetrieveCustomInitialData(tflite_node, &tf_options); + if (status.ok()) { // custom case with indices as a second output + RETURN_IF_ERROR(CheckInputsOutputs(context, tflite_node, /*inputs=*/1, + /*outputs=*/2)); + } else { // common pooling with 1 output + RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); + RETURN_IF_ERROR(CheckInputsOutputs(context, tflite_node, /*inputs=*/1, + /*outputs=*/1)); + } + RETURN_IF_ERROR(CheckKernelsAndStrides( + tf_options->filter_height, tf_options->filter_width, + tf_options->stride_height, tf_options->stride_width)); + return IsActivationSupported(tf_options->activation); + } + + public: + explicit Pooling2DOperationParser(PoolingType type) : type_(type) {} + + Status Parse(const TfLiteNode* tflite_node, + const TfLiteRegistration* registration, GraphFloat32* graph, + ObjectReader* reader) final { + Node* node = graph->NewNode(); + node->operation.type = ToString(OperationType::POOLING_2D); + RETURN_IF_ERROR(reader->AddInput(node, 0)); + RETURN_IF_ERROR(reader->AddOutput(node, 0)); + + Pooling2DAttributes attr; + attr.type = type_; + + auto input_shape = graph->FindInputs(node->id)[0]->tensor.shape; + + // check whether there are custom options encoded. It happens if operation + // is MaxPoolingWithArgmax2D. There is no way to read + // tflite_node->builtin_code, so, simply check whether custom data is + // available. + auto* tf_options = reinterpret_cast( + tflite_node->custom_initial_data); + if (!tf_options) { + tf_options = + reinterpret_cast(tflite_node->builtin_data); + } + if (!tf_options) { + return InternalError("Missing tflite params"); + } + + std::vector max_tensor_id{0}; + RETURN_IF_ERROR(MaybeFuseActivation(tf_options->activation, max_tensor_id, + graph, node)); + // Second output is optional. It is not required, it but must be added after + // MaybeAddFusedActivation function is called + reader->AddOutput(node, 1).IgnoreError(); + + // First output is the result of pooling operation, while second output is + // indices used for pooling. + auto outputs = graph->FindOutputs(node->id); + attr.output_indices = outputs.size() == 2; + if (attr.output_indices) { + // Fix data type for output indices. In the model it is set as float32. 
+ outputs[1]->tensor.type = DataType::INT32; + } + RETURN_IF_ERROR(ParsePoolingAttributes(tf_options, input_shape, &attr)); + node->operation.attributes = attr; + return OkStatus(); + } + + private: + const PoolingType type_; +}; + +class ReLUOperationParser : public TFLiteOperationParser { + public: + explicit ReLUOperationParser(int clip) : clip_(clip) {} + + Status IsSupported(const TfLiteContext* context, + const TfLiteNode* tflite_node, + const TfLiteRegistration* registration) final { + RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 1)); + return OkStatus(); + } + + Status Parse(const TfLiteNode* tflite_node, + const TfLiteRegistration* registration, GraphFloat32* graph, + ObjectReader* reader) final { + Node* node = graph->NewNode(); + node->operation.type = ToString(OperationType::RELU); RETURN_IF_ERROR(reader->AddInput(node, 0)); + ReLUAttributes attr; + TfLiteLeakyReluParams* tf_options = nullptr; + RetrieveBuiltinData(tflite_node, &tf_options).IgnoreError(); + attr.alpha = tf_options ? tf_options->alpha : 0; + attr.clip = clip_; + node->operation.attributes = attr; + return reader->AddOutputs(node); + } + + private: + const int clip_; +}; + +class ReshapeOperationParser : public TFLiteOperationParser { + public: + Status IsSupported(const TfLiteContext* context, + const TfLiteNode* tflite_node, + const TfLiteRegistration* registration) final { + RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 1)); + RETURN_IF_ERROR( + CheckInputsOutputs(context, tflite_node, /*inputs=*/1, /*outputs=*/1)); + // TODO(eignasheva): add shape checking + return OkStatus(); + } + + Status Parse(const TfLiteNode* tflite_node, + const TfLiteRegistration* registration, GraphFloat32* graph, + ObjectReader* reader) final { + Node* node = graph->NewNode(); + node->operation.type = ToString(OperationType::RESHAPE); + RETURN_IF_ERROR(reader->AddInput(node, 0)); + RETURN_IF_ERROR(reader->AddOutputs(node)); + // Here we may have extra inputs. Other tensors were supposed to + // define new shape, but in TFLite these are ignored. + // TODO(akulik): check that shapes match? + + // New shape comes from output shape. + ReshapeAttributes attr; + attr.new_shape = graph->FindOutputs(node->id)[0]->tensor.shape; + node->operation.attributes = attr; + return OkStatus(); + } +}; + +class ResizeBilinearOperationParser : public TFLiteOperationParser { + public: + Status IsSupported(const TfLiteContext* context, + const TfLiteNode* tflite_node, + const TfLiteRegistration* registration) final { + RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 1)); + RETURN_IF_ERROR( + CheckInputsOutputs(context, tflite_node, /*inputs=*/1, /*outputs=*/1)); + + // TODO(eignasheva): check shapes. + TfLiteResizeBilinearParams* tf_options = nullptr; + RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); + return OkStatus(); + } + + Status Parse(const TfLiteNode* tflite_node, + const TfLiteRegistration* registration, GraphFloat32* graph, + ObjectReader* reader) final { + Node* node = graph->NewNode(); + node->operation.type = ToString(OperationType::UPSAMPLE_2D); + RETURN_IF_ERROR(reader->AddInput(node, 0)); + RETURN_IF_ERROR(reader->AddOutputs(node)); + // Here we may have extra inputs. Other tensors were supposed to + // define new shape, but in TFLite these are ignored. 
+ const auto* tf_options = - reinterpret_cast( + reinterpret_cast( tflite_node->builtin_data); - if (tf_options->weights_format != - kTfLiteFullyConnectedWeightsFormatDefault) { - return UnimplementedError("Unsupported FullyConnected weights format."); + if (!tf_options) { + return InternalError("Missing tflite params"); } + Upsample2DAttributes attr; + attr.align_corners = tf_options->align_corners; + attr.type = UpsamplingType::BILINEAR; + attr.new_shape.CopyAllDefinedAxis( + graph->FindOutputs(node->id)[0]->tensor.shape); + node->operation.attributes = attr; + return OkStatus(); + } +}; - FullyConnectedAttributes attr; - RETURN_IF_ERROR(GetFullyConnectedAttributes(1, 2, reader, &attr)); - - Tensor weights; - RETURN_IF_ERROR(reader->ReadTensor(1, &weights)); - auto input = graph->FindInputs(node->id)[0]; - int batch_size = input->tensor.shape.b; - if (input->tensor.shape.DimensionsProduct() / batch_size != - weights.shape.w) { - return UnimplementedError( - "Amount of input data should match weights width"); +class SoftmaxOperationParser : public TFLiteOperationParser { + public: + Status IsSupported(const TfLiteContext* context, + const TfLiteNode* tflite_node, + const TfLiteRegistration* registration) final { + RETURN_IF_ERROR(CheckMaxSupportedOpVersion(registration, 1)); + RETURN_IF_ERROR( + CheckInputsOutputs(context, tflite_node, /*inputs=*/1, /*outputs=*/1)); + TfLiteSoftmaxParams* tf_options = nullptr; + RETURN_IF_ERROR(RetrieveBuiltinData(tflite_node, &tf_options)); + if (tf_options->beta != 1) { + // TODO(eignasheva): figure out, what's wrong with softmax. + return UnimplementedError("Softmax.beta != 1 is not supported."); } + return OkStatus(); + } - Node* conv = node; - if (input->tensor.shape.h != 1 || input->tensor.shape.w != 1) { - auto& reshape = node; - conv = graph->NewNode(); // reset conv pointer! - Value>* reshaped_value = graph->NewValue(); - reshaped_value->tensor.shape = BHWC(1, 1, 1, weights.shape.w); - RETURN_IF_ERROR(graph->SetProducer(reshape->id, reshaped_value->id)); - reshape->operation.type = ToString(OperationType::RESHAPE); - ReshapeAttributes attr; - attr.new_shape = reshaped_value->tensor.shape; - reshape->operation.attributes = attr; - RETURN_IF_ERROR(graph->AddConsumer(conv->id, reshaped_value->id)); + Status Parse(const TfLiteNode* tflite_node, + const TfLiteRegistration* registration, GraphFloat32* graph, + ObjectReader* reader) final { + Node* node = graph->NewNode(); + node->operation.type = ToString(OperationType::SOFT_MAX); + RETURN_IF_ERROR(reader->AddInput(node, 0)); + RETURN_IF_ERROR(reader->AddOutputs(node)); + + const auto* tf_options = + reinterpret_cast(tflite_node->builtin_data); + if (!tf_options) { + return InternalError("Missing tflite params"); } - - conv->operation.type = ToString(OperationType::FULLY_CONNECTED); - conv->operation.attributes = std::move(attr); - Status result = reader->AddOutputs(conv); - RETURN_IF_ERROR(MaybeFuseActivationToTheSingleOutput(tf_options->activation, - graph, conv)); - - return result; + if (tf_options->beta != 1) { + // there is multiply by scalar operation fused in softmax. Make a layer + // out of it before softmax. + return UnimplementedError("Softmax.beta != 1 is not supported."); + // auto mul_node = reader->NewPassthroughNode(node); + // mul_node->operation.type = ToString(OperationType::MUL); + } + // TODO(impjdi): Rename to SoftmaxAttributes. 
+ SoftMaxAttributes attr; + attr.axis = Axis::CHANNELS; // always by channels + node->operation.attributes = attr; + return OkStatus(); } }; @@ -1746,6 +1750,7 @@ class StridedSliceOperationParser : public TFLiteOperationParser { RETURN_IF_ERROR(CheckOptionsSupport(tf_options)); return OkStatus(); } + Status Parse(const TfLiteNode* tflite_node, const TfLiteRegistration* registration, GraphFloat32* graph, ObjectReader* reader) final { @@ -1907,6 +1912,7 @@ class TransposeConvOperationParser : public TFLiteOperationParser { CheckStrides(tf_options->stride_height, tf_options->stride_width)); return OkStatus(); } + // TFLite's TRANSPOSE_CONV expects 3 input (output shape, weights, and input) // and allows configurable padding & stride. // TODO(impjdi): Translate output_shape to attr.adjacent. @@ -1940,85 +1946,49 @@ class TransposeConvOperationParser : public TFLiteOperationParser { } }; -class Convolution2DTransposeBiasParser : public TFLiteOperationParser { +class Unpooling2DOperationParser : public TFLiteOperationParser { public: Status IsSupported(const TfLiteContext* context, const TfLiteNode* tflite_node, const TfLiteRegistration* registration) final { - RETURN_IF_ERROR(CheckTensorIsAvailable(context, tflite_node, 1)); - TfLiteTransposeConvParams* tf_options = nullptr; - RETURN_IF_ERROR(RetrieveCustomInitialData(tflite_node, &tf_options)); + TfLitePoolParams* tf_options = nullptr; RETURN_IF_ERROR( - CheckStrides(tf_options->stride_height, tf_options->stride_width)); + CheckInputsOutputs(context, tflite_node, /*inputs=*/2, /*outputs=*/1)); + RETURN_IF_ERROR(RetrieveCustomInitialData(tflite_node, &tf_options)); + RETURN_IF_ERROR(CheckKernelsAndStrides( + tf_options->filter_height, tf_options->filter_width, + tf_options->stride_height, tf_options->stride_width)); return OkStatus(); } + Status Parse(const TfLiteNode* tflite_node, const TfLiteRegistration* registration, GraphFloat32* graph, ObjectReader* reader) final { - auto* node = graph->NewNode(); - node->operation.type = ToString(OperationType::CONVOLUTION_TRANSPOSED); + Node* node = graph->NewNode(); + node->operation.type = ToString(OperationType::MAX_UNPOOLING_2D); RETURN_IF_ERROR(reader->AddInput(node, 0)); + RETURN_IF_ERROR(reader->AddInput(node, 1)); RETURN_IF_ERROR(reader->AddOutputs(node)); - - const auto* params = reinterpret_cast( + auto input_shape = graph->FindInputs(node->id)[0]->tensor.shape; + MaxUnpooling2DAttributes attr; + const auto* tf_options = reinterpret_cast( tflite_node->custom_initial_data); - ConvolutionTransposedAttributes attr; - attr.stride = - params ? 
HW(params->stride_height, params->stride_width) : HW(1, 1); + if (!tf_options) { + return InternalError("Missing tflite params"); + } + attr.kernel = ToHW(tf_options->filter_height, tf_options->filter_width); + attr.strides = ToHW(tf_options->stride_height, tf_options->stride_width); + UpdatePadding(tf_options->padding, input_shape, &attr); - RETURN_IF_ERROR(reader->ReadTensor(1, &attr.weights)); - reader->ReadTensor(2, &attr.bias).IgnoreError(); // bias is optional + node->operation.attributes = attr; - UpdatePadding(params->padding, graph->FindInputs(node->id)[0]->tensor.shape, - &attr); - - node->operation.attributes = std::move(attr); - return OkStatus(); - } -}; - -class SpaceToBatchOperationParser : public TFLiteOperationParser { - public: - Status IsSupported(const TfLiteContext* context, - const TfLiteNode* tflite_node, - const TfLiteRegistration* registration) final { - return OkStatus(); - } - Status Parse(const TfLiteNode* tflite_node, - const TfLiteRegistration* registration, GraphFloat32* graph, - ObjectReader* reader) final { - auto* node = graph->NewNode(); - node->operation.type = ToString(OperationType::SPACE_TO_BATCH); - RETURN_IF_ERROR(reader->AddInput(node, 0)); - RETURN_IF_ERROR(reader->AddOutputs(node)); - SpaceToBatchAttributes sb_attr; - Tensor block; - RETURN_IF_ERROR(reader->ReadTensor(1, &block)); - if (block.shape.v != 2) { - return InternalError("Space has to be HxW."); - } - sb_attr.block.h = block.data[0]; - sb_attr.block.w = block.data[1]; - - Tensor padding; - RETURN_IF_ERROR(reader->ReadTensor(2, &padding)); - auto padding_shape = padding.shape; - - if (padding_shape.h != 2 && padding_shape.w != 2) { - return InternalError("Space has to be HxW."); - } - - sb_attr.padding.prepended.h = padding.data[0]; - sb_attr.padding.prepended.w = padding.data[2]; - - sb_attr.padding.appended.h = padding.data[1]; - sb_attr.padding.appended.w = padding.data[3]; - - node->operation.attributes = std::move(sb_attr); + auto output_value = graph->FindOutputs(node->id)[0]; + output_value->tensor.shape = CalculateOutputShape(input_shape, attr); return OkStatus(); } }; +// TODO(impjdi): BATCH_TO_SPACE/SPACE_TO_BATCH shouldn't be supported. 
class BatchToSpaceOperationParser : public TFLiteOperationParser { public: Status IsSupported(const TfLiteContext* context, @@ -2026,6 +1996,7 @@ class BatchToSpaceOperationParser : public TFLiteOperationParser { const TfLiteRegistration* registration) final { return OkStatus(); } + Status Parse(const TfLiteNode* tflite_node, const TfLiteRegistration* registration, GraphFloat32* graph, ObjectReader* reader) final { @@ -2061,7 +2032,51 @@ class BatchToSpaceOperationParser : public TFLiteOperationParser { } }; +class SpaceToBatchOperationParser : public TFLiteOperationParser { + public: + Status IsSupported(const TfLiteContext* context, + const TfLiteNode* tflite_node, + const TfLiteRegistration* registration) final { + return OkStatus(); + } + + Status Parse(const TfLiteNode* tflite_node, + const TfLiteRegistration* registration, GraphFloat32* graph, + ObjectReader* reader) final { + auto* node = graph->NewNode(); + node->operation.type = ToString(OperationType::SPACE_TO_BATCH); + RETURN_IF_ERROR(reader->AddInput(node, 0)); + RETURN_IF_ERROR(reader->AddOutputs(node)); + SpaceToBatchAttributes sb_attr; + Tensor block; + RETURN_IF_ERROR(reader->ReadTensor(1, &block)); + if (block.shape.v != 2) { + return InternalError("Space has to be HxW."); + } + sb_attr.block.h = block.data[0]; + sb_attr.block.w = block.data[1]; + + Tensor padding; + RETURN_IF_ERROR(reader->ReadTensor(2, &padding)); + auto padding_shape = padding.shape; + + if (padding_shape.h != 2 && padding_shape.w != 2) { + return InternalError("Space has to be HxW."); + } + + sb_attr.padding.prepended.h = padding.data[0]; + sb_attr.padding.prepended.w = padding.data[2]; + + sb_attr.padding.appended.h = padding.data[1]; + sb_attr.padding.appended.w = padding.data[3]; + + node->operation.attributes = std::move(sb_attr); + return OkStatus(); + } +}; + class UnsupportedOperationParser : public TFLiteOperationParser { + public: Status IsSupported(const TfLiteContext* context, const TfLiteNode* tflite_node, const TfLiteRegistration* registration) final { @@ -2105,7 +2120,7 @@ std::unique_ptr NewOperationParser( case kTfLiteBuiltinLog: return make_unique(OperationType::LOG); case kTfLiteBuiltinLstm: - return make_unique(); + return make_unique(); case kTfLiteBuiltinMaxPool2d: return make_unique(PoolingType::MAX); case kTfLiteBuiltinMul: @@ -2115,13 +2130,13 @@ std::unique_ptr NewOperationParser( case kTfLiteBuiltinPow: return make_unique(OperationType::POW); case kTfLiteBuiltinRelu: - return make_unique(0); + return make_unique(0); case kTfLiteBuiltinRelu6: - return make_unique(6); + return make_unique(6); case kTfLiteBuiltinLeakyRelu: - return make_unique(0); + return make_unique(0); case kTfLiteBuiltinPrelu: - return make_unique(); + return make_unique(); case kTfLiteBuiltinReshape: return make_unique(); case kTfLiteBuiltinResizeBilinear: @@ -2131,7 +2146,7 @@ std::unique_ptr NewOperationParser( case kTfLiteBuiltinSin: return make_unique(OperationType::SIN); case kTfLiteBuiltinSoftmax: - return make_unique(); + return make_unique(); case kTfLiteBuiltinStridedSlice: return make_unique(); case kTfLiteBuiltinSqrt: From 1f7959a055d3f72bc8a3738b13ca795d9de9ada0 Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Mon, 22 Jul 2019 12:14:04 -0700 Subject: [PATCH 0322/3053] Specialize the handling of common DataType-valued attrs in op creation. 1. Avoid the use of proto serialization to get a type-valued attr from a created node. 2. Avoid the use of proto serialization to compare type-valued attrs when setting a pre-existing attr. 
This method uses fewer C API calls and dynamic allocations to access dtype-valued attrs. This path is particularly heavily exercised in graph-building code, as we fetch all the attrs of every created op, and we redundantly set any type-valued attrs whose type can be inferred from the inputs. PiperOrigin-RevId: 259378863 --- tensorflow/core/framework/attr_value_util.cc | 6 ++++++ tensorflow/python/client/tf_session.i | 8 ++++++++ tensorflow/python/framework/ops.py | 20 ++++++++++++++++++++ tensorflow/python/framework/python_op_gen.cc | 9 +++++++-- 4 files changed, 41 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/framework/attr_value_util.cc b/tensorflow/core/framework/attr_value_util.cc index ed7caaa6c0b..1eafd292f0f 100644 --- a/tensorflow/core/framework/attr_value_util.cc +++ b/tensorflow/core/framework/attr_value_util.cc @@ -152,6 +152,12 @@ uint64 AttrValueHash(const AttrValue& a, const TensorProtoHasher& tensor_hash) { bool AreAttrValuesEqual(const AttrValue& a, const AttrValue& b, const TensorProtosEquality& tensor_equality) { + if (a.type() != b.type()) { + return false; + } else if (a.type() != DT_INVALID && b.type() != DT_INVALID) { + return a.type() == b.type(); + } + if (a.has_tensor() != b.has_tensor()) { return false; } else if (a.has_tensor() && b.has_tensor()) { diff --git a/tensorflow/python/client/tf_session.i b/tensorflow/python/client/tf_session.i index 70de97835d3..763e1afdd1a 100644 --- a/tensorflow/python/client/tf_session.i +++ b/tensorflow/python/client/tf_session.i @@ -143,6 +143,14 @@ tensorflow::ImportNumpy(); $result = PyLong_FromUnsignedLongLong($1); } +// Convert TF_OperationGetAttrType TF_DataType* out-argument to Python integer. +%typemap(in, numinputs=0) TF_DataType *value (TF_DataType temp) { + $1 = &temp; +} +%typemap(argout) TF_DataType *value { + $result = PyInt_FromLong(*$1); +} + // We use TF_OperationGetControlInputs_wrapper instead of // TF_OperationGetControlInputs %ignore TF_OperationGetControlInputs; diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index d19646fc69e..e4a68e08ab0 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -81,6 +81,7 @@ _api_usage_gauge = monitoring.BoolGauge( # pylint: disable=protected-access _TensorLike = tensor_like._TensorLike +_DTYPES_INTERN_TABLE = dtypes._INTERN_TABLE # pylint: enable=protected-access @@ -2314,6 +2315,25 @@ class Operation(object): assert oneof_value in fields, "Unsupported field type in " + str(x) return getattr(x, oneof_value) + def _get_attr_type(self, name): + """Returns the value of the attr of this op with the given `name`. + + Args: + name: The name of the attr to fetch. + + Returns: + The value of the attr, as a Python object. + + Raises: + ValueError: If this op does not have an attr with the given `name`. + """ + try: + dtype_enum = c_api.TF_OperationGetAttrType(self._c_op, name) + return _DTYPES_INTERN_TABLE[dtype_enum] + except errors.InvalidArgumentError as e: + # Convert to ValueError for backwards compatibility. + raise ValueError(str(e)) + def run(self, feed_dict=None, session=None): """Runs this operation in a `Session`. 
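
For a quick sanity check of the new accessor, a minimal sketch (it assumes a TensorFlow build that already includes this change, and pokes the private `_get_attr_type` purely for illustration):

    import tensorflow as tf

    with tf.Graph().as_default():
      # A Const op carries a DataType-valued "dtype" attr.
      const_op = tf.constant(1.0).op
      # Generic path: fetches and parses a serialized AttrValue proto.
      generic = const_op.get_attr("dtype")
      # Specialized path added above: reads the enum via TF_OperationGetAttrType
      # and returns the interned DType directly, with no proto round-trip.
      fast = const_op._get_attr_type("dtype")
      assert generic == fast == tf.float32

The python_op_gen change below makes the generated graph-mode wrappers take the specialized path automatically for every "type"-valued attr.
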
diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc index d45428d7ca0..75dfb84ce24 100644 --- a/tensorflow/python/framework/python_op_gen.cc +++ b/tensorflow/python/framework/python_op_gen.cc @@ -391,8 +391,13 @@ void GenEagerPythonOp::HandleGraphMode(const string& function_setup) { for (int i = 0; i < op_def_.attr_size(); ++i) { if (i > 0) strings::StrAppend(&attr_values, ", "); const auto& attr_name(op_def_.attr(i).name()); - strings::StrAppend(&attr_values, "\"", attr_name, "\", _op.get_attr(\"", - attr_name, "\")"); + if (op_def_.attr(i).type() == "type") { + strings::StrAppend(&attr_values, "\"", attr_name, + "\", _op._get_attr_type(\"", attr_name, "\")"); + } else { + strings::StrAppend(&attr_values, "\"", attr_name, + "\", _op.get_attr(\"", attr_name, "\")"); + } } strings::StrAppend(&attr_values, ")"); strings::StrAppend( From 5334adcddb1009ae68316c661f3a40b8c8ff9f5e Mon Sep 17 00:00:00 2001 From: Penporn Koanantakool Date: Mon, 22 Jul 2019 12:19:25 -0700 Subject: [PATCH 0323/3053] Add XLA implementations for MatrixDiagV2, MatrixDiagPartV2, and MatrixSetDiagV2. PiperOrigin-RevId: 259379918 --- tensorflow/compiler/tests/BUILD | 13 + tensorflow/compiler/tests/binary_ops_test.py | 48 -- .../compiler/tests/matrix_diag_ops_test.py | 655 ++++++++++++++++++ tensorflow/compiler/tests/unary_ops_test.py | 26 - tensorflow/compiler/tf2xla/kernels/BUILD | 2 +- tensorflow/compiler/tf2xla/kernels/diag_op.cc | 49 +- .../tf2xla/kernels/matrix_diag_ops.cc | 425 ++++++++++++ .../tf2xla/kernels/matrix_set_diag_op.cc | 98 --- 8 files changed, 1096 insertions(+), 220 deletions(-) create mode 100644 tensorflow/compiler/tests/matrix_diag_ops_test.py create mode 100644 tensorflow/compiler/tf2xla/kernels/matrix_diag_ops.cc delete mode 100644 tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index 15bb0a863d1..d39d15986be 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -665,6 +665,19 @@ tf_xla_py_test( ], ) +tf_xla_py_test( + name = "matrix_diag_ops_test", + size = "medium", + timeout = "long", + srcs = ["matrix_diag_ops_test.py"], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:dtypes", + "//tensorflow/python:platform_test", + ], +) + tf_xla_py_test( name = "momentum_test", size = "small", diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py index 0171be42148..14af571d62f 100644 --- a/tensorflow/compiler/tests/binary_ops_test.py +++ b/tensorflow/compiler/tests/binary_ops_test.py @@ -28,7 +28,6 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.ops import array_ops from tensorflow.python.ops import bitwise_ops -from tensorflow.python.ops import gen_array_ops from tensorflow.python.ops import gen_math_ops from tensorflow.python.ops import gen_nn_ops from tensorflow.python.ops import math_ops @@ -1464,53 +1463,6 @@ class BinaryOpsTest(xla_test.XLATestCase): np.array([4, 5, 6], dtype=np.int32), expected=None) - def testMatrixSetDiag(self): - # TODO(penporn): Once XLA supports MatrixSetDiagV2, change the call to - # gen_array_ops.matrix_set_diag (V1) to array_ops.matrix_set_diag (V2). 
- for dtype in self.numeric_types: - # Square - self._testBinary( - gen_array_ops.matrix_set_diag, - np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0], [1.0, 1.0, 1.0]], - dtype=dtype), - np.array([1.0, 2.0, 3.0], dtype=dtype), - expected=np.array([[1.0, 1.0, 0.0], [1.0, 2.0, 1.0], [1.0, 1.0, 3.0]], - dtype=dtype)) - - self._testBinary( - gen_array_ops.matrix_set_diag, - np.array([[[1.0, 0.0, 3.0], [0.0, 2.0, 0.0], [1.0, 0.0, 3.0]], - [[4.0, 0.0, 4.0], [0.0, 5.0, 0.0], [2.0, 0.0, 6.0]]], - dtype=dtype), - np.array([[-1.0, 0.0, -3.0], [-4.0, -5.0, -6.0]], dtype=dtype), - expected=np.array( - [[[-1.0, 0.0, 3.0], [0.0, 0.0, 0.0], [1.0, 0.0, -3.0]], - [[-4.0, 0.0, 4.0], [0.0, -5.0, 0.0], [2.0, 0.0, -6.0]]], - dtype=dtype)) - - # Rectangular - self._testBinary( - gen_array_ops.matrix_set_diag, - np.array([[0.0, 1.0, 0.0], [1.0, 0.0, 1.0]], dtype=dtype), - np.array([3.0, 4.0], dtype=dtype), - expected=np.array([[3.0, 1.0, 0.0], [1.0, 4.0, 1.0]], dtype=dtype)) - - self._testBinary( - gen_array_ops.matrix_set_diag, - np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0]], dtype=dtype), - np.array([3.0, 4.0], dtype=dtype), - expected=np.array([[3.0, 1.0], [1.0, 4.0], [1.0, 1.0]], dtype=dtype)) - - self._testBinary( - gen_array_ops.matrix_set_diag, - np.array([[[1.0, 0.0, 3.0], [0.0, 2.0, 0.0]], - [[4.0, 0.0, 4.0], [0.0, 5.0, 0.0]]], dtype=dtype), - np.array([[-1.0, -2.0], [-4.0, -5.0]], - dtype=dtype), - expected=np.array([[[-1.0, 0.0, 3.0], [0.0, -2.0, 0.0]], - [[-4.0, 0.0, 4.0], [0.0, -5.0, 0.0]]], - dtype=dtype)) - def testBroadcastTo(self): for dtype in self.all_types: x = np.random.randint(0, high=100, size=[2, 3]) diff --git a/tensorflow/compiler/tests/matrix_diag_ops_test.py b/tensorflow/compiler/tests/matrix_diag_ops_test.py new file mode 100644 index 00000000000..a994be8b29d --- /dev/null +++ b/tensorflow/compiler/tests/matrix_diag_ops_test.py @@ -0,0 +1,655 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for XLA matrix diag ops.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.compiler.tests import xla_test +from tensorflow.python.compat import compat +from tensorflow.python.ops import array_ops +from tensorflow.python.platform import googletest + + +# Test cases shared by MatrixDiagV2, MatrixDiagPartV2, and MatrixSetDiagV2. 
+# Copied from //third_party/tensorflow/python/kernel_tests/diag_op_test.py +def square_cases(): + # pyformat: disable + mat = np.array([[[1, 2, 3, 4, 5], + [6, 7, 8, 9, 1], + [3, 4, 5, 6, 7], + [8, 9, 1, 2, 3], + [4, 5, 6, 7, 8]], + [[9, 1, 2, 3, 4], + [5, 6, 7, 8, 9], + [1, 2, 3, 4, 5], + [6, 7, 8, 9, 1], + [2, 3, 4, 5, 6]]]) + tests = dict() + # tests[d_lower, d_upper] = (compact_diagonals, padded_diagnals) + tests[-1, -1] = (np.array([[6, 4, 1, 7], + [5, 2, 8, 5]]), + np.array([[[0, 0, 0, 0, 0], + [6, 0, 0, 0, 0], + [0, 4, 0, 0, 0], + [0, 0, 1, 0, 0], + [0, 0, 0, 7, 0]], + [[0, 0, 0, 0, 0], + [5, 0, 0, 0, 0], + [0, 2, 0, 0, 0], + [0, 0, 8, 0, 0], + [0, 0, 0, 5, 0]]])) + tests[-4, -3] = (np.array([[[8, 5], + [4, 0]], + [[6, 3], + [2, 0]]]), + np.array([[[0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [8, 0, 0, 0, 0], + [4, 5, 0, 0, 0]], + [[0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0], + [6, 0, 0, 0, 0], + [2, 3, 0, 0, 0]]])) + tests[-2, 1] = (np.array([[[2, 8, 6, 3, 0], + [1, 7, 5, 2, 8], + [6, 4, 1, 7, 0], + [3, 9, 6, 0, 0]], + [[1, 7, 4, 1, 0], + [9, 6, 3, 9, 6], + [5, 2, 8, 5, 0], + [1, 7, 4, 0, 0]]]), + np.array([[[1, 2, 0, 0, 0], + [6, 7, 8, 0, 0], + [3, 4, 5, 6, 0], + [0, 9, 1, 2, 3], + [0, 0, 6, 7, 8]], + [[9, 1, 0, 0, 0], + [5, 6, 7, 0, 0], + [1, 2, 3, 4, 0], + [0, 7, 8, 9, 1], + [0, 0, 4, 5, 6]]])) + tests[2, 4] = (np.array([[[5, 0, 0], + [4, 1, 0], + [3, 9, 7]], + [[4, 0, 0], + [3, 9, 0], + [2, 8, 5]]]), + np.array([[[0, 0, 3, 4, 5], + [0, 0, 0, 9, 1], + [0, 0, 0, 0, 7], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0]], + [[0, 0, 2, 3, 4], + [0, 0, 0, 8, 9], + [0, 0, 0, 0, 5], + [0, 0, 0, 0, 0], + [0, 0, 0, 0, 0]]])) + # pyformat: enable + return (mat, tests) + + +def tall_cases(): + # pyformat: disable + mat = np.array([[[1, 2, 3], + [4, 5, 6], + [7, 8, 9], + [9, 8, 7], + [6, 5, 4]], + [[3, 2, 1], + [1, 2, 3], + [4, 5, 6], + [7, 8, 9], + [9, 8, 7]]]) + tests = dict() + # tests[d_lower, d_upper] = (compact_diagonals, padded_diagnals) + tests[0, 0] = (np.array([[1, 5, 9], + [3, 2, 6]]), + np.array([[[1, 0, 0], + [0, 5, 0], + [0, 0, 9], + [0, 0, 0]], + [[3, 0, 0], + [0, 2, 0], + [0, 0, 6], + [0, 0, 0]]])) + tests[-4, -3] = (np.array([[[9, 5], + [6, 0]], + [[7, 8], + [9, 0]]]), + np.array([[[0, 0, 0], + [0, 0, 0], + [0, 0, 0], + [9, 0, 0], + [6, 5, 0]], + [[0, 0, 0], + [0, 0, 0], + [0, 0, 0], + [7, 0, 0], + [9, 8, 0]]])) + tests[-2, -1] = (np.array([[[4, 8, 7], + [7, 8, 4]], + [[1, 5, 9], + [4, 8, 7]]]), + np.array([[[0, 0, 0], + [4, 0, 0], + [7, 8, 0], + [0, 8, 7], + [0, 0, 4]], + [[0, 0, 0], + [1, 0, 0], + [4, 5, 0], + [0, 8, 9], + [0, 0, 7]]])) + tests[-2, 1] = (np.array([[[2, 6, 0], + [1, 5, 9], + [4, 8, 7], + [7, 8, 4]], + [[2, 3, 0], + [3, 2, 6], + [1, 5, 9], + [4, 8, 7]]]), + np.array([[[1, 2, 0], + [4, 5, 6], + [7, 8, 9], + [0, 8, 7], + [0, 0, 4]], + [[3, 2, 0], + [1, 2, 3], + [4, 5, 6], + [0, 8, 9], + [0, 0, 7]]])) + tests[1, 2] = (np.array([[[3, 0], + [2, 6]], + [[1, 0], + [2, 3]]]), + np.array([[[0, 2, 3], + [0, 0, 6], + [0, 0, 0], + [0, 0, 0], + [0, 0, 0]], + [[0, 2, 1], + [0, 0, 3], + [0, 0, 0], + [0, 0, 0], + [0, 0, 0]]])) + # pyformat: enable + return (mat, tests) + + +def fat_cases(): + # pyformat: disable + mat = np.array([[[1, 2, 3, 4], + [5, 6, 7, 8], + [9, 1, 2, 3]], + [[4, 5, 6, 7], + [8, 9, 1, 2], + [3, 4, 5, 6]]]) + tests = dict() + # tests[d_lower, d_upper] = (compact_diagonals, padded_diagnals) + tests[0, 0] = (np.array([[1, 6, 2], + [4, 9, 5]]), + np.array([[[1, 0, 0, 0], + [0, 6, 0, 0], + [0, 0, 2, 0]], + [[4, 0, 0, 0], + [0, 9, 0, 0], + [0, 0, 
5, 0]]])) + tests[2, 2] = (np.array([[3, 8], + [6, 2]]), + np.array([[[0, 0, 3, 0], + [0, 0, 0, 8], + [0, 0, 0, 0]], + [[0, 0, 6, 0], + [0, 0, 0, 2], + [0, 0, 0, 0]]])) + tests[-2, 0] = (np.array([[[1, 6, 2], + [5, 1, 0], + [9, 0, 0]], + [[4, 9, 5], + [8, 4, 0], + [3, 0, 0]]]), + np.array([[[1, 0, 0, 0], + [5, 6, 0, 0], + [9, 1, 2, 0]], + [[4, 0, 0, 0], + [8, 9, 0, 0], + [3, 4, 5, 0]]])) + tests[-1, 1] = (np.array([[[2, 7, 3], + [1, 6, 2], + [5, 1, 0]], + [[5, 1, 6], + [4, 9, 5], + [8, 4, 0]]]), + np.array([[[1, 2, 0, 0], + [5, 6, 7, 0], + [0, 1, 2, 3]], + [[4, 5, 0, 0], + [8, 9, 1, 0], + [0, 4, 5, 6]]])) + tests[0, 3] = (np.array([[[4, 0, 0], + [3, 8, 0], + [2, 7, 3], + [1, 6, 2]], + [[7, 0, 0], + [6, 2, 0], + [5, 1, 6], + [4, 9, 5]]]), + np.array([[[1, 2, 3, 4], + [0, 6, 7, 8], + [0, 0, 2, 3]], + [[4, 5, 6, 7], + [0, 9, 1, 2], + [0, 0, 5, 6]]])) + # pyformat: enable + return (mat, tests) + + +class MatrixDiagTest(xla_test.XLATestCase): + + def _assertOpOutputMatchesExpected(self, + params, + solution, + rtol=1e-3, + atol=1e-5): + """Verifies that matrix_diag produces `solution` when fed `params`. + + Args: + params: dictionary containing input parameters to matrix_diag. + solution: numpy array representing the expected output of matrix_diag. + rtol: relative tolerance for equality test. + atol: absolute tolerance for equality test. + """ + diagonal = params["diagonal"] + with self.session() as session: + for dtype in self.numeric_types - {np.int8, np.uint8}: + expected = solution.astype(dtype) + with self.test_scope(): + params["diagonal"] = array_ops.placeholder( + dtype, diagonal.shape, name="diagonal") + output = array_ops.matrix_diag(**params) + result = session.run(output, + {params["diagonal"]: diagonal.astype(dtype)}) + self.assertEqual(output.dtype, expected.dtype) + self.assertAllCloseAccordingToType( + expected, result, rtol=rtol, atol=atol, bfloat16_rtol=0.03) + + # Generic tests applicable to both v1 and v2 ops. + # Originally from unary_ops_tests.py. + def testV1(self): + # pyformat: disable + vecs1 = np.array([[1, 2], + [3, 4]]) + solution1 = np.array([[[1, 0], [0, 2]], + [[3, 0], [0, 4]]]) + vecs2 = np.array([1, 2, 3, 4]) + solution2 = np.array([[1, 0, 0, 0], + [0, 2, 0, 0], + [0, 0, 3, 0], + [0, 0, 0, 4]]) + vecs3 = np.array([[[1, 2, 3], + [4, 5, 6]], + [[7, 8, 9], # pylint: disable=bad-whitespace + [10, 11, 12]]]) + solution3 = np.array([[[[1, 0, 0], + [0, 2, 0], + [0, 0, 3]], + [[4, 0, 0], + [0, 5, 0], + [0, 0, 6]]], + [[[7, 0, 0], + [0, 8, 0], + [0, 0, 9]], + [[10, 0, 0], + [0, 11, 0], + [0, 0, 12]]]]) + # pyformat: enable + self._assertOpOutputMatchesExpected({"diagonal": vecs1}, solution1) + self._assertOpOutputMatchesExpected({"diagonal": vecs2}, solution2) + self._assertOpOutputMatchesExpected({"diagonal": vecs3}, solution3) + + # From here onwards are v2-only tests. 
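+  # In the V2 op, "k" selects the diagonal band: a scalar picks a single
+  # diagonal, while a pair (k[0], k[1]) selects every diagonal from k[0]
+  # (the lowest) up to k[1] (the highest). The compact test inputs above
+  # stack the selected diagonals from the uppermost down to the lowest.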
+ def testSquare(self): + # LINT.IfChange + if compat.forward_compatible(2019, 7, 31): + # LINT.ThenChange(//tensorflow/python/ops/array_ops.py) + for _, tests in [square_cases()]: + for diag_index, (vecs, solution) in tests.items(): + self._assertOpOutputMatchesExpected( + { + "diagonal": vecs[0], + "k": diag_index + }, solution[0]) + + def testSquareBatch(self): + # LINT.IfChange + if compat.forward_compatible(2019, 7, 31): + # LINT.ThenChange(//tensorflow/python/ops/array_ops.py) + for _, tests in [square_cases()]: + for diag_index, (vecs, solution) in tests.items(): + self._assertOpOutputMatchesExpected( + { + "diagonal": vecs, + "k": diag_index + }, solution) + + def testRectangularBatch(self): + # LINT.IfChange + if not compat.forward_compatible(2019, 7, 31): + # LINT.ThenChange(//tensorflow/python/ops/array_ops.py) + return + + # Stores expected num_rows and num_cols (when the other is given). + # expected[(d_lower, d_upper)] = (expected_num_rows, expected_num_cols) + test_list = list() + + # Square cases: + expected = { + (-1, -1): (5, 4), + (-4, -3): (5, 2), + (-2, 1): (5, 5), + (2, 4): (3, 5), + } + test_list.append((expected, square_cases())) + + # Tall cases + expected = { + (0, 0): (3, 3), + (-4, -3): (5, 2), + (-2, -1): (4, 3), + (-2, 1): (3, 3), + (1, 2): (2, 3) + } + test_list.append((expected, tall_cases())) + + # Fat cases + expected = { + (2, 2): (2, 4), + (-2, 0): (3, 3), + (-1, 1): (3, 3), + (0, 3): (3, 3) + } + test_list.append((expected, fat_cases())) + + # Giving both num_rows and num_cols + for _, tests in [tall_cases(), fat_cases()]: + for diag_index, (vecs, solution) in tests.items(): + self._assertOpOutputMatchesExpected( + { + "diagonal": vecs, + "k": diag_index, + "num_rows": solution.shape[-2], + "num_cols": solution.shape[-1] + }, solution) + + # Giving just num_rows or num_cols. + for expected, (_, tests) in test_list: + for diag_index, (new_num_rows, new_num_cols) in expected.items(): + vecs, solution = tests[diag_index] + solution_given_num_rows = solution.take( + indices=range(new_num_cols), axis=-1) + self._assertOpOutputMatchesExpected( + { + "diagonal": vecs, + "k": diag_index, + "num_rows": solution_given_num_rows.shape[-2] + }, solution_given_num_rows) + solution_given_num_cols = solution.take( + indices=range(new_num_rows), axis=-2) + self._assertOpOutputMatchesExpected( + { + "diagonal": vecs, + "k": diag_index, + "num_cols": solution_given_num_cols.shape[-1] + }, solution_given_num_cols) + + def testPadding(self): + # LINT.IfChange + if compat.forward_compatible(2019, 7, 31): + # LINT.ThenChange(//tensorflow/python/ops/array_ops.py) + for padding_value in [555, -11]: + for _, tests in [square_cases(), tall_cases(), fat_cases()]: + for diag_index, (vecs, solution) in tests.items(): + mask = (solution == 0) + solution = solution + (mask * padding_value) + self._assertOpOutputMatchesExpected( + { + "diagonal": vecs, + "k": diag_index, + "num_rows": solution.shape[-2], + "num_cols": solution.shape[-1], + "padding_value": padding_value + }, solution) + + +class MatrixSetDiagTest(xla_test.XLATestCase): + + def _assertOpOutputMatchesExpected(self, + params, + solution, + rtol=1e-3, + atol=1e-5): + """Verifies that matrix_set_diag produces `solution` when fed `params`. + + Args: + params: dictionary containing input parameters to matrix_set_diag. + solution: numpy array representing the expected output of matrix_set_diag. + rtol: relative tolerance for equality test. + atol: absolute tolerance for equality test. 
+ """ + input = params["input"] # pylint: disable=redefined-builtin + diagonal = params["diagonal"] + with self.session() as session: + for dtype in self.numeric_types - {np.int8, np.uint8}: + expected = solution.astype(dtype) + with self.test_scope(): + params["input"] = array_ops.placeholder( + dtype, input.shape, name="input") + params["diagonal"] = array_ops.placeholder( + dtype, diagonal.shape, name="diagonal") + output = array_ops.matrix_set_diag(**params) + result = session.run( + output, { + params["input"]: input.astype(dtype), + params["diagonal"]: diagonal.astype(dtype) + }) + self.assertEqual(output.dtype, expected.dtype) + self.assertAllCloseAccordingToType( + expected, result, rtol=rtol, atol=atol, bfloat16_rtol=0.03) + + # Generic tests applicable to both v1 and v2 ops. + # Originally from binary_ops_tests.py. + def testV1(self): + test_cases = list() + + # pyformat: disable + # pylint: disable=bad-whitespace + # Square cases. + input = np.array([[0, 1, 0], # pylint: disable=redefined-builtin + [1, 0, 1], + [1, 1, 1]]) + diag = np.array([1, 2, 3]) + solution = np.array([[1, 1, 0], + [1, 2, 1], + [1, 1, 3]]) + test_cases.append(({"input": input, "diagonal": diag}, solution)) + + input = np.array([[[1, 0, 3], + [0, 2, 0], + [1, 0, 3]], + [[4, 0, 4], + [0, 5, 0], + [2, 0, 6]]]) + diag = np.array([[-1, 0, -3], + [-4, -5, -6]]) + solution = np.array([[[-1, 0, 3], + [ 0, 0, 0], + [ 1, 0, -3]], + [[-4, 0, 4], + [ 0, -5, 0], + [ 2, 0, -6]]]) + test_cases.append(({"input": input, "diagonal": diag}, solution)) + + # Rectangular cases. + input = np.array([[0, 1, 0], + [1, 0, 1]]) + diag = np.array([3, 4]) + solution = np.array([[3, 1, 0], + [1, 4, 1]]) + test_cases.append(({"input": input, "diagonal": diag}, solution)) + + input = np.array([[0, 1], + [1, 0], + [1, 1]]) + diag = np.array([3, 4]) + solution = np.array([[3, 1], + [1, 4], + [1, 1]]) + test_cases.append(({"input": input, "diagonal": diag}, solution)) + + input = np.array([[[1, 0, 3], + [0, 2, 0]], + [[4, 0, 4], + [0, 5, 0]]]) + diag = np.array([[-1, -2], [-4, -5]]) + solution = np.array([[[-1, 0, 3], + [ 0, -2, 0]], + [[-4, 0, 4], + [ 0, -5, 0]]]) + test_cases.append(({"input": input, "diagonal": diag}, solution)) + # pylint: enable=bad-whitespace + # pyformat: enable + + for test in test_cases: + self._assertOpOutputMatchesExpected(test[0], test[1]) + + # From here onwards are v2-only tests. 
+ def testSingleMatrix(self): + # LINT.IfChange + if compat.forward_compatible(2019, 7, 31): + # LINT.ThenChange(//tensorflow/python/ops/array_ops.py) + for _, tests in [square_cases(), tall_cases(), fat_cases()]: + for diag_index, (vecs, banded_mat) in tests.items(): + mask = (banded_mat[0] == 0) + input_mat = np.random.randint(10, size=mask.shape) + solution = input_mat * mask + banded_mat[0] + self._assertOpOutputMatchesExpected( + { + "input": input_mat, + "diagonal": vecs[0], + "k": diag_index + }, solution) + + def testBatch(self): + # LINT.IfChange + if compat.forward_compatible(2019, 7, 31): + # LINT.ThenChange(//tensorflow/python/ops/array_ops.py) + for _, tests in [square_cases(), tall_cases(), fat_cases()]: + for diag_index, (vecs, banded_mat) in tests.items(): + mask = (banded_mat == 0) + input_mat = np.random.randint(10, size=mask.shape) + solution = input_mat * mask + banded_mat + self._assertOpOutputMatchesExpected( + { + "input": input_mat, + "diagonal": vecs, + "k": diag_index + }, solution) + + +class MatrixDiagPartTest(xla_test.XLATestCase): + + def _assertOpOutputMatchesExpected(self, + params, + solution, + rtol=1e-3, + atol=1e-5): + """Verifies that matrix_diag_part produces `solution` when fed `params`. + + Args: + params: dictionary containing input parameters to matrix_diag_part. + solution: numpy array representing the expected output. + rtol: relative tolerance for equality test. + atol: absolute tolerance for equality test. + """ + input = params["input"] # pylint: disable=redefined-builtin + with self.session() as session: + for dtype in self.numeric_types - {np.int8, np.uint8}: + expected = solution.astype(dtype) + with self.test_scope(): + params["input"] = array_ops.placeholder( + dtype, input.shape, name="input") + output = array_ops.matrix_diag_part(**params) + result = session.run(output, { + params["input"]: input.astype(dtype), + }) + self.assertEqual(output.dtype, expected.dtype) + self.assertAllCloseAccordingToType( + expected, result, rtol=rtol, atol=atol, bfloat16_rtol=0.03) + + # Generic tests applicable to both v1 and v2 ops. + # Originally from unary_ops_tests.py. + def testV1(self): + matrices = np.arange(3 * 2 * 4).reshape([3, 2, 4]) + solution = np.array([[0, 5], [8, 13], [16, 21]]) + self._assertOpOutputMatchesExpected({"input": matrices}, solution) + + # From here onwards are v2-only tests. 
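+  # Diagonals shorter than the longest one in the requested band come back
+  # padded at the end (with padding_value), so every row of the compact
+  # output has the length of the longest diagonal.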
+ def testSingleMatrix(self): + # LINT.IfChange + if compat.forward_compatible(2019, 7, 31): + # LINT.ThenChange(//tensorflow/python/ops/array_ops.py) + for mat, tests in [square_cases(), tall_cases(), fat_cases()]: + for diag_index, (solution, _) in tests.items(): + self._assertOpOutputMatchesExpected({ + "input": mat[0], + "k": diag_index + }, solution[0]) + + def testBatch(self): + # LINT.IfChange + if compat.forward_compatible(2019, 7, 31): + # LINT.ThenChange(//tensorflow/python/ops/array_ops.py) + for mat, tests in [square_cases(), tall_cases(), fat_cases()]: + for diag_index, (solution, _) in tests.items(): + self._assertOpOutputMatchesExpected({ + "input": mat, + "k": diag_index + }, solution) + + def testPadding(self): + # LINT.IfChange + if compat.forward_compatible(2019, 7, 31): + # LINT.ThenChange(//tensorflow/python/ops/array_ops.py) + for padding_value in [555, -11]: + for mat, tests in [square_cases(), tall_cases(), fat_cases()]: + for diag_index, (solution, _) in tests.items(): + mask = (solution == 0) + solution = solution + (mask * padding_value) + self._assertOpOutputMatchesExpected( + { + "input": mat, + "k": diag_index, + "padding_value": padding_value + }, solution) + + +if __name__ == "__main__": + googletest.main() diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py index bac30b63bf8..64af33c7a2a 100644 --- a/tensorflow/compiler/tests/unary_ops_test.py +++ b/tensorflow/compiler/tests/unary_ops_test.py @@ -27,7 +27,6 @@ from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import dtypes from tensorflow.python.ops import array_ops from tensorflow.python.ops import bitwise_ops -from tensorflow.python.ops import gen_array_ops from tensorflow.python.ops import gen_nn_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops @@ -108,31 +107,6 @@ class UnaryOpsTest(xla_test.XLATestCase): np.array([[-1, 1]], dtype=dtype), expected=np.array([[-1, 1]], dtype=dtype)) - # TODO(penporn): Once XLA supports MatrixDiagV2, change the call to - # gen_array_ops.matrix_diag* (V1) to array_ops.matrix_diag* (V2). 
- self._assertOpOutputMatchesExpected( - gen_array_ops.matrix_diag, np.array([[1, 2], [3, 4]], dtype=dtype), - np.array([[[1, 0], [0, 2]], [[3, 0], [0, 4]]], dtype=dtype)) - self._assertOpOutputMatchesExpected( - gen_array_ops.matrix_diag, np.array([1, 2, 3, 4], dtype=dtype), - np.array( - [[1, 0, 0, 0], [0, 2, 0, 0], [0, 0, 3, 0], [0, 0, 0, 4]], - dtype=dtype)) - self._assertOpOutputMatchesExpected( - gen_array_ops.matrix_diag, - np.array( - [[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], dtype=dtype), - np.array( - [[[[1, 0, 0], [0, 2, 0], [0, 0, 3]], [[4, 0, 0], [0, 5, 0], [ - 0, 0, 6 - ]]], [[[7, 0, 0], [0, 8, 0], [0, 0, 9]], [[10, 0, 0], [0, 11, 0], - [0, 0, 12]]]], - dtype=dtype)) - self._assertOpOutputMatchesExpected( - gen_array_ops.matrix_diag_part, - np.arange(3 * 2 * 4).reshape([3, 2, 4]).astype(dtype), - np.array([[0, 5], [8, 13], [16, 21]], dtype=dtype)) - self._assertOpOutputMatchesExpected( array_ops.prevent_gradient, np.array([[-1, 1]], dtype=dtype), diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index 139d6709215..ef2202c3931 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -55,8 +55,8 @@ tf_kernel_library( "lrn_ops.cc", "matmul_op.cc", "matrix_band_part_op.cc", + "matrix_diag_ops.cc", "matrix_inverse_op.cc", - "matrix_set_diag_op.cc", "matrix_triangular_solve_op.cc", "mirror_pad_op.cc", "next_after_op.cc", diff --git a/tensorflow/compiler/tf2xla/kernels/diag_op.cc b/tensorflow/compiler/tf2xla/kernels/diag_op.cc index 747ec133983..1f12c7980e7 100644 --- a/tensorflow/compiler/tf2xla/kernels/diag_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/diag_op.cc @@ -20,8 +20,10 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/constants.h" #include "tensorflow/compiler/xla/client/lib/matrix.h" +#include "tensorflow/compiler/xla/client/lib/pooling.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/op_kernel.h" namespace tensorflow { @@ -153,52 +155,5 @@ class DiagPartOp : public XlaOpKernel { REGISTER_XLA_OP(Name("DiagPart"), DiagPartOp); -class MatrixDiagOp : public XlaOpKernel { - public: - explicit MatrixDiagOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} - - void Compile(XlaOpKernelContext* ctx) override { - OP_REQUIRES(ctx, ctx->num_inputs() >= 1, - errors::InvalidArgument("MatrixDiag op must have at an input")); - const TensorShape input_shape = ctx->InputShape(0); - - auto dims = input_shape.dim_sizes(); - OP_REQUIRES(ctx, !dims.empty(), - errors::InvalidArgument("Expected 1 <= dims, got shape ", - input_shape.DebugString())); - - - int last_dim = dims.size() - 1; - int64 last_dim_size = input_shape.dim_size(last_dim); - absl::Span other_dims(dims); - other_dims.remove_suffix(1); - - xla::XlaOp input = ctx->Input(0); - xla::XlaOp diag = CreateDiagonal(input, last_dim_size, other_dims); - ctx->SetOutput(0, diag); - } -}; - -REGISTER_XLA_OP(Name("MatrixDiag"), MatrixDiagOp); - -class MatrixDiagPartOp : public XlaOpKernel { - public: - explicit MatrixDiagPartOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} - - void Compile(XlaOpKernelContext* ctx) override { - const TensorShape input_shape = ctx->InputShape(0); - auto dims = input_shape.dim_sizes(); - - OP_REQUIRES(ctx, 2 <= dims.size(), - errors::InvalidArgument("Expected 2 <= dims, got shape ", - 
input_shape.DebugString())); - - xla::XlaOp input = ctx->Input(0); - ctx->SetOutput(0, xla::GetMatrixDiagonal(input)); - } -}; - -REGISTER_XLA_OP(Name("MatrixDiagPart"), MatrixDiagPartOp); - } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_diag_ops.cc b/tensorflow/compiler/tf2xla/kernels/matrix_diag_ops.cc new file mode 100644 index 00000000000..7eeb05a4920 --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/matrix_diag_ops.cc @@ -0,0 +1,425 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/lib/constants.h" +#include "tensorflow/compiler/xla/client/lib/matrix.h" +#include "tensorflow/compiler/xla/client/xla_builder.h" +#include "tensorflow/compiler/xla/primitive_util.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/lib/core/errors.h" + +namespace tensorflow { +namespace { + +// Reads or infers lower_diag_index and upper_diag_index from kernel's input +// parameter "k". Also validates their values. +std::pair ProcessDiagIndex(XlaOpKernelContext* context) { + int64 lower_diag_index = 0; + int64 upper_diag_index = 0; + TensorShape diag_index_shape = context->InputShape("k"); + + // Wrapping OP_REQUIRES* macros with a function because they can "return;" + // early (without values) which contradicts ProcessDiagIndex's signature. + auto validate_diag_indices = [&]() { + if (diag_index_shape.dims() == 0) { + OP_REQUIRES_OK(context, + context->ConstantInputAsIntScalar("k", &lower_diag_index)); + upper_diag_index = lower_diag_index; + } else { + std::vector diag_index; + OP_REQUIRES_OK(context, + context->ConstantInputAsIntVector("k", &diag_index)); + OP_REQUIRES( + context, !diag_index.empty() && diag_index.size() <= 2, + errors::InvalidArgument( + "diag_index must have only one or two elements, received ", + diag_index.size(), " elements.")); + lower_diag_index = diag_index[0]; + upper_diag_index = + (diag_index.size() > 1) ? diag_index[1] : lower_diag_index; + } + OP_REQUIRES( + context, lower_diag_index <= upper_diag_index, + errors::InvalidArgument( + "lower_diag_index must not be larger than upper_diag_index: ", + lower_diag_index, " > ", upper_diag_index)); + }; + validate_diag_indices(); + return {lower_diag_index, upper_diag_index}; +} + +// Makes sure lower_diag_index and upper_diag_index are consistent with the +// input matrix size. +void ValidateDiagIndexWithOutputMatrixSize(XlaOpKernelContext* context, + const int64 lower_diag_index, + const int64 upper_diag_index, + const int64 num_rows, + const int64 num_cols) { + // `lower_diag_index == 0` condition is added to handle matrix shape = 0. 
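+  // (Without that exception an empty matrix, i.e. num_rows == 0 or
+  //  num_cols == 0, could never pass the strict bounds below, even for the
+  //  default diagonal index 0.)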
+ OP_REQUIRES(context, + (-num_rows < lower_diag_index && lower_diag_index < num_cols) || + lower_diag_index == 0, + errors::InvalidArgument( + "lower_diag_index is out of bound: ", lower_diag_index, + " It must be between ", -num_rows, " and ", num_cols)); + OP_REQUIRES(context, + (-num_rows < upper_diag_index && upper_diag_index < num_cols) || + upper_diag_index == 0, + errors::InvalidArgument( + "upper_diag_index is out of bound: ", upper_diag_index, + " It must be between ", -num_rows, " and ", num_cols)); + OP_REQUIRES(context, lower_diag_index <= upper_diag_index, + errors::InvalidArgument( + "lower_diag_index must not be larger than upper_diag_index: ", + lower_diag_index, " > ", upper_diag_index)); +} + +// Kernel to set matrix diagonals. +xla::XlaOp SetMatrixDiag(const xla::XlaOp input, const xla::XlaOp diag, + const TensorShape& input_shape, const int64 diag_rank, + const int64 num_diags, const int64 lower_diag_index, + const int64 upper_diag_index, const int64 max_diag_len, + const int64 num_rows, const int64 num_cols) { + // Creates a padding config. + const int input_rank = input_shape.dims(); + xla::PaddingConfig padding_config; + padding_config = xla::MakeNoPaddingConfig(input_rank - 1); + + // Processes one diagonal at a time: + // 1) Extracts a single diagonal (diag_slice). + // 2) Broadcasts its contents to fill the whole matrix (diag_broadcast). + // 3) Masks diag_broadcast to get the right diagonal shape. + // + // XLA can fuse multiple Broadcasts and Selects so this shouldn't be slow. + // + // For example, + // diag = [[2, 3, 0], k = (-1, 1), and num_rows = 4. + // [4, 5, 6], + // [7, 8, 9]] + // The expected output is [[4, 2, 0], + // [7, 5, 4], + // [0, 8, 6], + // [0, 0, 9]] + // The 1st diagonal is created by: + // 1) Extracting diag_slice = [1, 2, 0]. + // 2) Padding the vector to be as long as num_rows, + // diag_slice = [1, 2, 0, 0], + // then broadcasting diag_slice row-wise to a full matrix, + // diag_broadcast = [[1, 1, 1], + // [2, 2, 2], + // [0, 0, 0], + // [0, 0, 0]] + // The padding value can be anything because it will not appear in the + // results after masking. Here, we use zero. + // 3) Masking diag_broadcast with a mask of the shape of the 1st diagonal. + // mask = [[0, 1, 0], --> output = [[x, 2, x], + // [0, 0, 1], [x, x, 3], + // [0, 0, 0], [x, x, x], + // [0, 0, 0]] [x, x, x]], + // where x denotes the existing input contents. + std::vector broadcast_dimensions(input_rank - 1); + absl::c_iota(broadcast_dimensions, 0); + auto output = input; + for (int64 diag_index = lower_diag_index; diag_index <= upper_diag_index; + ++diag_index) { + // Extracts a single diagonal. + auto diag_slice = diag; + if (num_diags > 1) { + const int64 mapped_diag_index = upper_diag_index - diag_index; + diag_slice = xla::Collapse( + xla::SliceInDim(diag, mapped_diag_index, mapped_diag_index + 1, 1, + diag_rank - 2), + {diag_rank - 2, diag_rank - 1}); + } + + // Pads if necessary. Always pad at the end because shorter diagonals in + // the input come padded at the end. + const int64 padding_length = + ((diag_index <= 0) ? num_cols : num_rows) - max_diag_len; + const xla::XlaOp zero = xla::ScalarLike(input, 0); + if (padding_length > 0) { + padding_config.mutable_dimensions(input_rank - 2) + ->set_edge_padding_high(padding_length); + diag_slice = xla::Pad(diag_slice, zero, padding_config); + } + + // Broadcasts column-wise for subdiagonals; row-wise for superdiagonals. + broadcast_dimensions.back() = + (diag_index <= 0) ? 
input_rank - 1 : input_rank - 2; + xla::XlaOp diag_broadcast = xla::BroadcastInDim( + diag_slice, input_shape.dim_sizes(), broadcast_dimensions); + const auto mask = xla::GetDiagonalMask(output, diag_index); + output = xla::Select(mask, diag_broadcast, output); + } + return output; +} + +} // namespace + +class MatrixDiagOp : public XlaOpKernel { + public: + explicit MatrixDiagOp(OpKernelConstruction* context) : XlaOpKernel(context) {} + + void Compile(XlaOpKernelContext* context) override { + OP_REQUIRES( + context, context->num_inputs() >= 1, + errors::InvalidArgument("MatrixDiag op must have at least one input")); + const TensorShape diag_shape = context->InputShape(0); + OP_REQUIRES(context, TensorShapeUtils::IsVectorOrHigher(diag_shape), + errors::InvalidArgument("Expected >= 1 dims, got shape ", + diag_shape.DebugString())); + + const DataType dtype = context->expected_output_dtype(0); + const xla::XlaOp zero = XlaHelpers::Zero(context->builder(), dtype); + + // Initializes MatrixDiagV2-specific variables. + // Input arguments providing the values of num_rows and num_cols can be + // absent (-1) and will be inferred later. + int64 lower_diag_index = 0; + int64 upper_diag_index = 0; + int64 num_rows = -1; + int64 num_cols = -1; + xla::XlaOp padding_value = zero; + + // MatrixDiag and MatrixDiagV2 both use this OpKernel. MatrixDiag only has + // one input, so we have to check the number of inputs before reading + // additional parameters for MatrixDiagV2. + if (context->num_inputs() > 1) { + std::tie(lower_diag_index, upper_diag_index) = ProcessDiagIndex(context); + OP_REQUIRES_OK(context, context->ConstantInputAsIntScalar(2, &num_rows)); + OP_REQUIRES_OK(context, context->ConstantInputAsIntScalar(3, &num_cols)); + padding_value = context->Input(4); + } + + // More size validations. + const int64 diag_rank = diag_shape.dims(); + const int64 max_diag_len = diag_shape.dim_size(diag_rank - 1); + const int64 num_diags = upper_diag_index - lower_diag_index + 1; + OP_REQUIRES( + context, + num_diags == 1 || num_diags == diag_shape.dim_size(diag_rank - 2), + errors::InvalidArgument( + "The number of diagonals provided in the input does not " + "match the lower_diag_index and upper_diag_index range.")); + const int64 min_num_rows = max_diag_len - std::min(upper_diag_index, 0LL); + const int64 min_num_cols = max_diag_len + std::max(lower_diag_index, 0LL); + OP_REQUIRES(context, num_rows == -1 || num_rows >= min_num_rows, + errors::InvalidArgument("The number of rows is too small.")); + OP_REQUIRES(context, num_cols == -1 || num_cols >= min_num_cols, + errors::InvalidArgument("The number of columns is too small.")); + + // Infers num_rows and num_cols. If both are unknown, assume that the output + // is square. Otherwise, use smallest possible values. + if (num_rows == -1 && num_cols == -1) { + num_rows = std::max(min_num_rows, min_num_cols); + num_cols = num_rows; + } else if (num_rows == -1) { + num_rows = min_num_rows; + } else if (num_cols == -1) { + num_cols = min_num_cols; + } + + // At least one of num_rows and num_cols must match its minimum length. + // Otherwise, we'll have some incomplete diagonals. + OP_REQUIRES(context, num_rows == min_num_rows || num_cols == min_num_cols, + errors::InvalidArgument( + "The number of rows or columns is not consistent with " + "the specified d_lower, d_upper, and diagonal.")); + + // Actual processing. + // Initializes the output tensor with padding_value. + TensorShape output_shape = diag_shape; + output_shape.RemoveLastDims((num_diags == 1) ? 
1 : 2); + output_shape.AddDim(num_rows); + output_shape.AddDim(num_cols); + xla::XlaOp output = xla::Broadcast(padding_value, output_shape.dim_sizes()); + xla::XlaOp diag = context->Input(0); + context->SetOutput( + 0, SetMatrixDiag(output, diag, output_shape, diag_rank, num_diags, + lower_diag_index, upper_diag_index, max_diag_len, + num_rows, num_cols)); + } +}; + +REGISTER_XLA_OP(Name("MatrixDiag"), MatrixDiagOp); +REGISTER_XLA_OP(Name("MatrixDiagV2") + .CompileTimeConstantInput("k") + .CompileTimeConstantInput("num_rows") + .CompileTimeConstantInput("num_cols") + .CompileTimeConstantInput("padding_value"), + MatrixDiagOp); + +class MatrixDiagPartOp : public XlaOpKernel { + public: + explicit MatrixDiagPartOp(OpKernelConstruction* context) + : XlaOpKernel(context) {} + + void Compile(XlaOpKernelContext* context) override { + const TensorShape input_shape = context->InputShape(0); + const int input_rank = input_shape.dims(); + + OP_REQUIRES(context, TensorShapeUtils::IsMatrixOrHigher(input_shape), + errors::InvalidArgument( + "input must be at least 2-dim, received shape: ", + input_shape.DebugString())); + + const DataType dtype = context->expected_output_dtype(0); + const xla::XlaOp zero = XlaHelpers::Zero(context->builder(), dtype); + + // Initializes MatrixDiagPartV2-specific variables. + int64 lower_diag_index = 0; + int64 upper_diag_index = 0; + xla::XlaOp padding_value = zero; + + // MatrixDiagPart and MatrixDiagPartV2 both use this OpKernel. + // MatrixDiagPart only has one input, so we have to check the number of + // inputs before reading additional parameters in MatrixDiagV2. + if (context->num_inputs() > 1) { + std::tie(lower_diag_index, upper_diag_index) = ProcessDiagIndex(context); + padding_value = context->Input(2); + } + + // Checks if diag sizes are consistent with input. + const int64 num_rows = input_shape.dim_size(input_rank - 2); + const int64 num_cols = input_shape.dim_size(input_rank - 1); + ValidateDiagIndexWithOutputMatrixSize(context, lower_diag_index, + upper_diag_index, num_rows, num_cols); + + // Creates output shape. + TensorShape output_shape = input_shape; + output_shape.RemoveLastDims(2); + const int num_diags = upper_diag_index - lower_diag_index + 1; + if (num_diags > 1) output_shape.AddDim(num_diags); + const int32 max_diag_len = + std::min(num_rows + std::min(upper_diag_index, 0LL), + num_cols - std::max(lower_diag_index, 0LL)); + output_shape.AddDim(max_diag_len); + + // Computes output. + xla::XlaOp input = context->Input(0); + std::vector diag_list; + xla::PaddingConfig padding_config; + if (num_diags == 1) { + context->SetOutput(0, xla::GetMatrixDiagonal(input, upper_diag_index)); + return; + } + padding_config = xla::MakeNoPaddingConfig(input_rank - 1); + for (int diag_index = upper_diag_index; diag_index >= lower_diag_index; + --diag_index) { + auto single_diag = xla::GetMatrixDiagonal(input, diag_index); + const int64 diag_length = + (diag_index >= 0) ? 
(num_cols - diag_index) : (num_rows + diag_index); + const int64 padding_length = max_diag_len - diag_length; + if (padding_length > 0) { + padding_config.mutable_dimensions(input_rank - 2) + ->set_edge_padding_high(padding_length); + single_diag = xla::Pad(single_diag, padding_value, padding_config); + } + diag_list.emplace_back(single_diag); + } + auto concat = + xla::ConcatInDim(context->builder(), diag_list, input_rank - 2); + context->SetOutput(0, xla::Reshape(concat, output_shape.dim_sizes())); + } +}; + +REGISTER_XLA_OP(Name("MatrixDiagPart"), MatrixDiagPartOp); +REGISTER_XLA_OP(Name("MatrixDiagPartV2") + .CompileTimeConstantInput("k") + .CompileTimeConstantInput("padding_value"), + MatrixDiagPartOp); + +class MatrixSetDiagOp : public XlaOpKernel { + public: + explicit MatrixSetDiagOp(OpKernelConstruction* context) + : XlaOpKernel(context) {} + + void Compile(XlaOpKernelContext* context) override { + const TensorShape input_shape = context->InputShape(0); + const TensorShape diag_shape = context->InputShape(1); + const int input_rank = input_shape.dims(); + const int diag_rank = diag_shape.dims(); + + // Preliminary validation of sizes. + OP_REQUIRES(context, TensorShapeUtils::IsMatrixOrHigher(input_shape), + errors::InvalidArgument( + "input must be at least 2-dim, received shape: ", + input_shape.DebugString())); + OP_REQUIRES(context, TensorShapeUtils::IsVectorOrHigher(diag_shape), + errors::InvalidArgument( + "diagonal must be at least 1-dim, received shape: ", + diag_shape.DebugString())); + + // MatrixSetDiag and MatrixSetDiagV2 both use this OpKernel. MatrixSetDiag + // only has two inputs, so we have to check the number of inputs before + // reading additional parameters in MatrixSetDiagV2. + int64 lower_diag_index = 0; + int64 upper_diag_index = 0; + if (context->num_inputs() > 2) { + std::tie(lower_diag_index, upper_diag_index) = ProcessDiagIndex(context); + } + + // Checks if diag sizes are consistent with input. + const int64 num_rows = input_shape.dim_size(input_rank - 2); + const int64 num_cols = input_shape.dim_size(input_rank - 1); + ValidateDiagIndexWithOutputMatrixSize(context, lower_diag_index, + upper_diag_index, num_rows, num_cols); + const Eigen::Index num_diags = upper_diag_index - lower_diag_index + 1; + OP_REQUIRES( + context, + lower_diag_index == upper_diag_index || + (diag_shape.dim_size(input_rank - 2) == num_diags), + errors::InvalidArgument("The number of diagonals provided in `diag` " + "is not consistent with `lower_diag_index` and " + "`upper_diag_index`")); + + TensorShape expected_diag_shape = input_shape; + expected_diag_shape.RemoveLastDims(2); + if (num_diags > 1) expected_diag_shape.AddDim(num_diags); + const int32 max_diag_len = + std::min(num_rows + std::min(upper_diag_index, 0LL), + num_cols - std::max(lower_diag_index, 0LL)); + expected_diag_shape.AddDim(max_diag_len); + OP_REQUIRES( + context, expected_diag_shape == diag_shape, + errors::InvalidArgument( + "Either first dimensions of diagonal don't match input.shape[:-2], " + "or diagonal.shape[:-1] is not equal to the longests diagonal in " + "range [lower_diag_index:upper_diag_index].\nInput shape: ", + input_shape.DebugString(), + "\nDiagonal shape: ", diag_shape.DebugString(), + "\nExpected diagonal shape: ", expected_diag_shape.DebugString())); + + // Actual processing. 
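+    // Overwrites the requested band of the input with the provided diagonals.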
+ xla::XlaOp input = context->Input(0); + xla::XlaOp diag = context->Input(1); + context->SetOutput( + 0, SetMatrixDiag(input, diag, input_shape, diag_rank, num_diags, + lower_diag_index, upper_diag_index, max_diag_len, + num_rows, num_cols)); + } + + private: + TF_DISALLOW_COPY_AND_ASSIGN(MatrixSetDiagOp); +}; + +REGISTER_XLA_OP(Name("MatrixSetDiag"), MatrixSetDiagOp); +REGISTER_XLA_OP(Name("MatrixSetDiagV2").CompileTimeConstantInput("k"), + MatrixSetDiagOp); + +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc deleted file mode 100644 index ee9764c0c35..00000000000 --- a/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/tf2xla/xla_helpers.h" -#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" -#include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/xla_builder.h" -#include "tensorflow/compiler/xla/primitive_util.h" - -namespace tensorflow { - -class MatrixSetDiagOp : public XlaOpKernel { - public: - explicit MatrixSetDiagOp(OpKernelConstruction* context) - : XlaOpKernel(context) {} - - void Compile(XlaOpKernelContext* context) override { - const TensorShape input_shape = context->InputShape(0); - const TensorShape diag_shape = context->InputShape(1); - - const int rank = input_shape.dims(); - - // Preliminary validation of sizes. - OP_REQUIRES(context, TensorShapeUtils::IsMatrixOrHigher(input_shape), - errors::InvalidArgument( - "input must be at least 2-dim, received shape: ", - input_shape.DebugString())); - - // Check to make sure the last dimension of diag is equal to the smaller of - // the last two dimensions of input. - const int64 m = input_shape.dim_size(rank - 2); - const int64 n = input_shape.dim_size(rank - 1); - const int64 min_dim = std::min(m, n); - - TensorShape batch_shape = input_shape; - batch_shape.RemoveLastDims(2); - - TensorShape expected_diag_shape = batch_shape; - expected_diag_shape.AddDim(min_dim); - OP_REQUIRES(context, expected_diag_shape == diag_shape, - errors::InvalidArgument( - "must have diagonal.shape == input.shape[:-2] + " - "min(input.shape[-2:]), but received input shape: ", - input_shape.DebugString(), - " and diagonal shape: ", diag_shape.DebugString())); - - xla::XlaBuilder* builder = context->builder(); - xla::XlaOp input = context->Input(0); - xla::XlaOp diag = context->Input(1); - - auto zero = XlaHelpers::Zero(builder, context->input_type(0)); - - // Create an indicator tensor that is true only on the diagonal. 
- xla::XlaOp iota_m = xla::Iota(builder, xla::S32, m); - xla::XlaOp iota_n = xla::Iota(builder, xla::S32, n); - auto indicator = xla::Eq(iota_m, xla::Broadcast(iota_n, {m}), - /*broadcast_dimensions=*/{0}); - indicator = xla::Broadcast(indicator, batch_shape.dim_sizes()); - - // Broadcast diag up to the input shape. Use an implicit broadcast (Add/Or) - // because we need to broadcast on the right. - std::vector diag_broadcast_dims(rank - 1); - std::iota(diag_broadcast_dims.begin(), diag_broadcast_dims.end(), 0); - if (min_dim != m) { - diag_broadcast_dims.back() = rank - 1; - } - if (context->input_xla_type(0) == xla::PRED) { - diag = xla::Or(diag, xla::Broadcast(zero, input_shape.dim_sizes()), - /*broadcast_dimensions=*/diag_broadcast_dims); - - } else { - diag = xla::Add(diag, xla::Broadcast(zero, input_shape.dim_sizes()), - /*broadcast_dimensions=*/diag_broadcast_dims); - } - - auto output = xla::Select(indicator, diag, input); - context->SetOutput(0, output); - } - - private: - TF_DISALLOW_COPY_AND_ASSIGN(MatrixSetDiagOp); -}; - -REGISTER_XLA_OP(Name("MatrixSetDiag"), MatrixSetDiagOp); - -} // namespace tensorflow From b0cd40d7c7fd3828ae15bbbcf8b5f1f272ebf5c2 Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Mon, 22 Jul 2019 12:24:09 -0700 Subject: [PATCH 0324/3053] Internal cleanup: avoid discarding the result of NodeTransformer visitor, for consistency. This is a no-op, because generic_visit doesn't ever replace the node, so the CL is purely for consistency. PiperOrigin-RevId: 259380818 --- tensorflow/python/autograph/converters/control_flow.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/autograph/converters/control_flow.py b/tensorflow/python/autograph/converters/control_flow.py index 526c642c337..7f846bacf5f 100644 --- a/tensorflow/python/autograph/converters/control_flow.py +++ b/tensorflow/python/autograph/converters/control_flow.py @@ -347,7 +347,7 @@ class ControlFlowTransformer(converter.Base): return loop_vars, loop_vars_ast_tuple def visit_While(self, node): - self.generic_visit(node) + node = self.generic_visit(node) (basic_loop_vars, composite_loop_vars, reserved_symbols, possibly_undefs) = self._get_loop_vars( @@ -419,7 +419,7 @@ class ControlFlowTransformer(converter.Base): return undefined_assigns + node def visit_For(self, node): - self.generic_visit(node) + node = self.generic_visit(node) (basic_loop_vars, composite_loop_vars, reserved_symbols, possibly_undefs) = self._get_loop_vars( From 391147eb73c0b134dbc8ec542c38ac488c0c9bf3 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Thu, 18 Jul 2019 12:00:21 -0700 Subject: [PATCH 0325/3053] Small code refactoring. --- .../xla/service/gpu/nvptx_compiler.cc | 57 +++++++++++-------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc index 3ddacb2c3d9..33bd36980b9 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -223,6 +223,37 @@ void WarnIfBadDriverJITVersion() { }); } +// Try to load ptx from files defined in the FLAGS. If successful, return true. +bool MaybeLoadPtxFromFile(const HloModule* module, std::string* ptx) { + // If the xla_gpu_ptx_file options is set, be explicit when a file is used + // and warn when a file is not used to ease catching typo in filename. 
+ std::string prefix = xla::FilenameFor(*module, *ptx); + std::string ptx_filename; + for (const string filename : module->config().debug_options().xla_gpu_ptx_file()) { + // To ease comparing many PTX versions, accept different suffix then + // the original filename. + if(absl::StartsWith(filename, prefix)) { + ptx_filename = filename; + VLOG(0) << "RunBackend() - Will load PTX from file: " << filename; + break; + } + } + if (module->config().debug_options().xla_gpu_ptx_file().size() > 0 && + ptx_filename.empty()) { + VLOG(0) << "RunBackend() - For module with prefix '" << prefix + << "', we did not found a PTX file to load."; + } + + if(!ptx_filename.empty()) { + std::ifstream ifs(ptx_filename, std::ifstream::in); + *ptx = std::string(std::istreambuf_iterator(ifs), + std::istreambuf_iterator()); + CHECK(!ptx->empty()) << "Empty or non existing PTX file: " << ptx_filename; + return true; + } + return false; +} + } // namespace // Runs optimization passes on the given HLO module. @@ -628,31 +659,7 @@ StatusOr> NVPTXCompiler::RunBackend( std::string ptx; - // Generate the PTX or load it if provided. - // If the xla_gpu_ptx_file options is set, be explicit when a file is used - // and warn when a file is not used to ease catching typo in filename. - std::string prefix = FilenameFor(*module, ptx); - std::string ptx_filename; - for (const string filename : module->config().debug_options().xla_gpu_ptx_file()) { - // To ease comparing many PTX versions, accept different suffix then - // the original filename. - if(absl::StartsWith(filename, prefix)) { - ptx_filename = filename; - VLOG(0) << "RunBackend() - Will load PTX from file: " << filename; - break; - } - } - if (module->config().debug_options().xla_gpu_ptx_file().size() > 0 && - ptx_filename.empty()) { - VLOG(0) << "RunBackend() - For module with prefix '" << prefix - << "', we did not found a PTX file to load."; - } - if(!ptx_filename.empty()) { - std::ifstream ifs(ptx_filename, std::ifstream::in); - ptx = std::string(std::istreambuf_iterator(ifs), - std::istreambuf_iterator()); - CHECK(!ptx.empty()) << "Empty or non existing PTX file: " << ptx_filename; - } else { + if (!MaybeLoadPtxFromFile(module.get(), &ptx)) { XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunBackend - CompileToPtx"); TF_ASSIGN_OR_RETURN(ptx, CompileToPtx(&llvm_module, {cc_major, cc_minor}, module->config(), libdevice_dir)); From 1f5e538ba905ee3616f72e73b73742e6ef4a6490 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Thu, 18 Jul 2019 12:52:02 -0700 Subject: [PATCH 0326/3053] Rename a variable for clarify and fix a comment typo. --- .../compiler/xla/service/gpu/nvptx_compiler.cc | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc index 33bd36980b9..b9af7b6b0b7 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -228,27 +228,27 @@ bool MaybeLoadPtxFromFile(const HloModule* module, std::string* ptx) { // If the xla_gpu_ptx_file options is set, be explicit when a file is used // and warn when a file is not used to ease catching typo in filename. 
std::string prefix = xla::FilenameFor(*module, *ptx); - std::string ptx_filename; + std::string matched_filename; for (const string filename : module->config().debug_options().xla_gpu_ptx_file()) { - // To ease comparing many PTX versions, accept different suffix then + // To ease comparing many PTX versions, accept different suffixes then // the original filename. if(absl::StartsWith(filename, prefix)) { - ptx_filename = filename; + matched_filename = filename; VLOG(0) << "RunBackend() - Will load PTX from file: " << filename; break; } } if (module->config().debug_options().xla_gpu_ptx_file().size() > 0 && - ptx_filename.empty()) { + matched_filename.empty()) { VLOG(0) << "RunBackend() - For module with prefix '" << prefix << "', we did not found a PTX file to load."; } - if(!ptx_filename.empty()) { - std::ifstream ifs(ptx_filename, std::ifstream::in); + if(!matched_filename.empty()) { + std::ifstream ifs(matched_filename, std::ifstream::in); *ptx = std::string(std::istreambuf_iterator(ifs), std::istreambuf_iterator()); - CHECK(!ptx->empty()) << "Empty or non existing PTX file: " << ptx_filename; + CHECK(!ptx->empty()) << "Empty or non existing PTX file: " << matched_filename; return true; } return false; From 6ec32e5bf1931f4861a2e69c0e2be6abd05777dd Mon Sep 17 00:00:00 2001 From: "Wen-Heng (Jack) Chung" Date: Fri, 19 Jul 2019 15:30:58 -0500 Subject: [PATCH 0327/3053] Refactor IR emitter to cope with both NVPTX and AMDGPU for workgroup dims. --- tensorflow/compiler/xla/service/gpu/BUILD | 1 + .../xla/service/gpu/elemental_ir_emitter.cc | 51 +++---------- .../xla/service/gpu/elemental_ir_emitter.h | 7 -- .../xla/service/gpu/parallel_loop_emitter.cc | 9 +-- .../compiler/xla/service/gpu/target_util.cc | 72 ++++++++++++++++++- .../compiler/xla/service/gpu/target_util.h | 12 +++- 6 files changed, 97 insertions(+), 55 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index a5fc6e80cec..2b0acaf44de 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -260,6 +260,7 @@ cc_library( hdrs = ["parallel_loop_emitter.h"], deps = [ ":partition_assignment", + ":target_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service/llvm_ir:ir_array", diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc index c0cd4addc7e..d2e3d513aa8 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc @@ -152,7 +152,7 @@ StatusOr GpuElementalIrEmitter::EmitMathCall( return EmitDeviceFunctionCall( callee_name, operands, input_types, output_type, - {llvm::Attribute::ReadNone, llvm::Attribute::NoUnwind}); + {llvm::Attribute::ReadNone, llvm::Attribute::NoUnwind}, b_); } StatusOr GpuElementalIrEmitter::EmitFloatBinaryOp( @@ -280,47 +280,16 @@ StatusOr GpuElementalIrEmitter::EmitComplexAbs( {prim_type, prim_type}, prim_type); } -llvm::Value* GpuElementalIrEmitter::EmitDeviceFunctionCall( - const string& callee_name, absl::Span operands, - absl::Span input_types, PrimitiveType output_type, - absl::Span attributes) { - std::vector ir_input_types; - for (PrimitiveType input_type : input_types) { - ir_input_types.push_back( - llvm_ir::PrimitiveTypeToIrType(input_type, module_)); - } - llvm::FunctionType* callee_type = llvm::FunctionType::get( - llvm_ir::PrimitiveTypeToIrType(output_type, 
module_), // Return type. - ir_input_types, // Parameter types. - false); // No variadic arguments. - - // Declares the callee if it is not declared already. - llvm::Function* callee = llvm::dyn_cast( - b_->GetInsertBlock() - ->getModule() - ->getOrInsertFunction(callee_name, callee_type) - .getCallee()); - - for (auto attribute : attributes) { - callee->addFnAttr(attribute); - } - - return Call(callee, llvm_ir::AsArrayRef(operands)); -} - llvm::Value* GpuElementalIrEmitter::EmitThreadId() { - llvm::Value* block_id = - IntCast(llvm_ir::EmitCallToIntrinsic( - llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x, {}, {}, b_), - b_->getIntNTy(128), /*isSigned=*/true, "block.id"); - llvm::Value* thread_id_in_block = - IntCast(llvm_ir::EmitCallToIntrinsic( - llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, b_), - b_->getIntNTy(128), /*isSigned=*/true, "thread.id"); - llvm::Value* threads_per_block = - IntCast(llvm_ir::EmitCallToIntrinsic( - llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x, {}, {}, b_), - b_->getIntNTy(128), /*isSigned=*/true, "threads_per_block"); + llvm::Value* block_id = IntCast( + EmitCallToTargetIntrinsic(TargetIntrinsicID::kBlockIdx, {}, {}, b_), + b_->getIntNTy(128), /*isSigned=*/true, "block.id"); + llvm::Value* thread_id_in_block = IntCast( + EmitCallToTargetIntrinsic(TargetIntrinsicID::kThreadIdx, {}, {}, b_), + b_->getIntNTy(128), /*isSigned=*/true, "thread.id"); + llvm::Value* threads_per_block = IntCast( + EmitCallToTargetIntrinsic(TargetIntrinsicID::kBlockDimx, {}, {}, b_), + b_->getIntNTy(128), /*isSigned=*/true, "threads_per_block"); return NSWAdd(NSWMul(block_id, threads_per_block), thread_id_in_block); } diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h index db4918c5890..c8a58a21980 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h @@ -100,13 +100,6 @@ class GpuElementalIrEmitter : public ElementalIrEmitter { llvm::Value* lhs_value, llvm::Value* rhs_value); - // Emits IR to call a device function named "callee_name" on the given - // operand. Returns the IR value that represents the return value. - llvm::Value* EmitDeviceFunctionCall( - const string& callee_name, absl::Span operands, - absl::Span input_type, PrimitiveType output_type, - absl::Span attributes); - // Emits IR to call an LLVM intrinsic of type [T] -> T. Adjusts // callee_name according to T. Returns the IR value that represents the // return value of the function. diff --git a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc index cb012649200..f9937ba77de 100644 --- a/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/parallel_loop_emitter.cc @@ -22,6 +22,7 @@ limitations under the License. 
// IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Value.h" +#include "tensorflow/compiler/xla/service/gpu/target_util.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -72,8 +73,8 @@ ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(absl::string_view loop_name, VLOG(3) << "EmitIndexAndSetExitBasicBlock unroll_factor " << unroll_factor_; CHECK_NE(index_type, nullptr); std::vector array_indices; - llvm::Value* block_id = llvm_ir::EmitCallToIntrinsic( - llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x, {}, {}, b_); + llvm::Value* block_id = + EmitCallToTargetIntrinsic(TargetIntrinsicID::kBlockIdx, {}, {}, b_); llvm_ir::AddRangeMetadata(0, launch_dimensions_.block_count(), static_cast(block_id)); block_id = b_->CreateZExtOrTrunc(block_id, index_type, "block_id"); @@ -82,8 +83,8 @@ ParallelLoopEmitter::EmitIndexAndSetExitBasicBlock(absl::string_view loop_name, // "It is guaranteed that [...] 0 <= %tid.x < %ntid.x" // // %ntid.x is currently specified as 1024. - llvm::Value* thread_id = llvm_ir::EmitCallToIntrinsic( - llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, b_); + llvm::Value* thread_id = + EmitCallToTargetIntrinsic(TargetIntrinsicID::kThreadIdx, {}, {}, b_); llvm_ir::AddRangeMetadata(0, launch_dimensions_.threads_per_block(), static_cast(thread_id)); thread_id = b_->CreateZExtOrTrunc(thread_id, index_type, "thread_id"); diff --git a/tensorflow/compiler/xla/service/gpu/target_util.cc b/tensorflow/compiler/xla/service/gpu/target_util.cc index 31f989bd58c..fb2a8d7beab 100644 --- a/tensorflow/compiler/xla/service/gpu/target_util.cc +++ b/tensorflow/compiler/xla/service/gpu/target_util.cc @@ -29,9 +29,14 @@ namespace { using absl::StrCat; // Wrapper structure for carrying llvm intrinsic ids for NVPTX/AMDGPU platforms. +// On AMDGPU, some of these operations are made as device functions instead of +// intrinsics. Therefore a variant type is used to wrap the lambda to call +// those device functions. 
struct TargetIntrinsics { llvm::Intrinsic::ID nvptx_intrinsic; - llvm::Intrinsic::ID amdgpu_intrinsic; + absl::variant*)>> + amdgpu_intrinsic_or_function; }; // Gets the llvm intrinsic ids on different platforms (NVPTX, AMDGPU) @@ -66,6 +71,30 @@ struct TargetIntrinsics GetIntrinsic(TargetIntrinsicID intrin) { return {llvm::Intrinsic::nvvm_barrier0, llvm::Intrinsic::amdgcn_s_barrier}; } + case TargetIntrinsicID::kBlockDimx: { + return {llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x, + [](llvm::IRBuilder<>* b_) -> llvm::CallInst* { + return EmitDeviceFunctionCall("__ockl_get_local_size", + {b_->getInt32(0)}, {U32}, U64, {}, + b_); + }}; + } + case TargetIntrinsicID::kBlockDimy: { + return {llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_y, + [](llvm::IRBuilder<>* b_) -> llvm::CallInst* { + return EmitDeviceFunctionCall("__ockl_get_local_size", + {b_->getInt32(1)}, {U32}, U64, {}, + b_); + }}; + } + case TargetIntrinsicID::kBlockDimz: { + return {llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_z, + [](llvm::IRBuilder<>* b_) -> llvm::CallInst* { + return EmitDeviceFunctionCall("__ockl_get_local_size", + {b_->getInt32(1)}, {U32}, U64, {}, + b_); + }}; + } } } @@ -156,6 +185,36 @@ string ObtainDeviceFunctionName(TargetDeviceFunctionID func_id, } } +llvm::CallInst* EmitDeviceFunctionCall( + const string& callee_name, absl::Span operands, + absl::Span input_types, PrimitiveType output_type, + absl::Span attributes, + llvm::IRBuilder<>* b) { + std::vector ir_input_types; + llvm::Module* module = b->GetInsertBlock()->getModule(); + for (PrimitiveType input_type : input_types) { + ir_input_types.push_back( + llvm_ir::PrimitiveTypeToIrType(input_type, module)); + } + llvm::FunctionType* callee_type = llvm::FunctionType::get( + llvm_ir::PrimitiveTypeToIrType(output_type, module), // Return type. + ir_input_types, // Parameter types. + false); // No variadic arguments. + + // Declares the callee if it is not declared already. + llvm::Function* callee = llvm::dyn_cast( + b->GetInsertBlock() + ->getModule() + ->getOrInsertFunction(callee_name, callee_type) + .getCallee()); + + for (auto attribute : attributes) { + callee->addFnAttr(attribute); + } + + return b->CreateCall(callee, llvm_ir::AsArrayRef(operands)); +} + llvm::CallInst* EmitCallToTargetIntrinsic( TargetIntrinsicID intrinsic_id, absl::Span operands, absl::Span overloaded_types, llvm::IRBuilder<>* b) { @@ -166,7 +225,16 @@ llvm::CallInst* EmitCallToTargetIntrinsic( if (target_triple.isNVPTX()) { llvm_intrinsic_id = gpu_intrinsic_id.nvptx_intrinsic; } else if (target_triple.getArch() == llvm::Triple::amdgcn) { - llvm_intrinsic_id = gpu_intrinsic_id.amdgpu_intrinsic; + llvm::Intrinsic::ID* llvm_intrinsic_id_ptr; + if ((llvm_intrinsic_id_ptr = absl::get_if( + &gpu_intrinsic_id.amdgpu_intrinsic_or_function))) { + llvm_intrinsic_id = *llvm_intrinsic_id_ptr; + } else { + std::function*)>* builder_func = + absl::get_if*)>>( + &gpu_intrinsic_id.amdgpu_intrinsic_or_function); + return (*builder_func)(b); + } } else { LOG(FATAL) << "Invalid triple " << target_triple.str(); } diff --git a/tensorflow/compiler/xla/service/gpu/target_util.h b/tensorflow/compiler/xla/service/gpu/target_util.h index d50529e395e..4355ed21136 100644 --- a/tensorflow/compiler/xla/service/gpu/target_util.h +++ b/tensorflow/compiler/xla/service/gpu/target_util.h @@ -39,6 +39,9 @@ enum class TargetIntrinsicID { kBlockIdy, kBlockIdz, kBarrierId, + kBlockDimx, + kBlockDimy, + kBlockDimz, }; // Enumeration to get target specific device math function. 
@@ -59,8 +62,15 @@ enum class TargetDeviceFunctionID { kHypot }; -// Emits a call to the specified target intrinsic with the given operands. +// Emits IR to call a device function named "callee_name" on the given +// operand. Returns the IR value that represents the return value. +llvm::CallInst* EmitDeviceFunctionCall( + const std::string& callee_name, absl::Span operands, + absl::Span input_type, PrimitiveType output_type, + absl::Span attributes, + llvm::IRBuilder<>* b); +// Emits a call to the specified target intrinsic with the given operands. // Overloaded intrinsics (for example, "minnum") must include a type // in overloaded_types for each overloaded type. Typically, overloaded // intrinsics have only a single overloaded type. From 8614aaf955a641736b67ce8b5c7a6752c4d8429e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 22 Jul 2019 13:17:28 -0700 Subject: [PATCH 0328/3053] Changes convert_stack's return type from list to tuple to make the value hashable. PiperOrigin-RevId: 259392206 --- .../python/debug/lib/session_debug_testlib.py | 4 +-- .../distribute/mirrored_strategy_test.py | 19 ++++++++++++ tensorflow/python/util/tf_stack.py | 29 ++++++++++--------- 3 files changed, 36 insertions(+), 16 deletions(-) diff --git a/tensorflow/python/debug/lib/session_debug_testlib.py b/tensorflow/python/debug/lib/session_debug_testlib.py index b438b6500ae..d14399b9cee 100644 --- a/tensorflow/python/debug/lib/session_debug_testlib.py +++ b/tensorflow/python/debug/lib/session_debug_testlib.py @@ -1462,14 +1462,14 @@ class SessionDebugTestBase(test_util.TensorFlowTestCase): # Lookup should work with node name input. traceback = dump.node_traceback("traceback/w") - self.assertIsInstance(traceback, list) + self.assertIsInstance(traceback, tuple) self.assertGreater(len(traceback), 0) for trace in traceback: self.assertIsInstance(trace, tuple) # Lookup should also work with tensor name input. 
traceback = dump.node_traceback("traceback/w:0") - self.assertIsInstance(traceback, list) + self.assertIsInstance(traceback, tuple) self.assertGreater(len(traceback), 0) for trace in traceback: self.assertIsInstance(trace, tuple) diff --git a/tensorflow/python/distribute/mirrored_strategy_test.py b/tensorflow/python/distribute/mirrored_strategy_test.py index 4e8f14ef4b6..8f94f390740 100644 --- a/tensorflow/python/distribute/mirrored_strategy_test.py +++ b/tensorflow/python/distribute/mirrored_strategy_test.py @@ -49,6 +49,7 @@ from tensorflow.python.framework import tensor_util from tensorflow.python.keras.engine import training as keras_training from tensorflow.python.keras.layers import core as keras_core from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gradients from tensorflow.python.ops import math_ops from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables @@ -1203,6 +1204,24 @@ class MultiWorkerMirroredStrategyTestWithChief( self._test_summary_for_replica_zero_only(strategy) +class MirroredVariableStopGradientTest(test.TestCase, parameterized.TestCase): + + @combinations.generate( + combinations.combine( + distribution=[ + strategy_combinations.mirrored_strategy_with_one_cpu, + strategy_combinations.mirrored_strategy_with_one_gpu, + ], + mode=["graph"])) + def testMirroredVariableAsStopGradient(self, distribution): + with distribution.scope(): + inp = constant_op.constant(1.0) + x = variables.Variable(1.0) + y = inp*x + grads = gradients.gradients(x, y, stop_gradients=x) + self.assertIsNone(grads[0]) + + def _replica_id(): replica_id = ds_context.get_replica_context().replica_id_in_sync_group if not isinstance(replica_id, ops.Tensor): diff --git a/tensorflow/python/util/tf_stack.py b/tensorflow/python/util/tf_stack.py index 5603989a0d1..a6ba59e2b56 100644 --- a/tensorflow/python/util/tf_stack.py +++ b/tensorflow/python/util/tf_stack.py @@ -199,21 +199,22 @@ def convert_stack(stack, include_func_start_lineno=False): included as the 5th entry in return tuples. Returns: - A list of n 4-tuples or 5-tuples + A tuple of n 4-tuples or 5-tuples (filename, lineno, name, code, [optional: func_start_lineno]), where the code tuple element is calculated from the corresponding elements of the input tuple. """ - ret = [] - for (filename, lineno, name, frame_globals, func_start_lineno) in stack: - linecache.checkcache(filename) - line = linecache.getline(filename, lineno, frame_globals) - if line: - line = line.strip() - else: - line = None - if include_func_start_lineno: - ret.append((filename, lineno, name, line, func_start_lineno)) - else: - ret.append((filename, lineno, name, line)) - return ret + def _tuple_generator(): # pylint: disable=missing-docstring + for (filename, lineno, name, frame_globals, func_start_lineno) in stack: + linecache.checkcache(filename) + line = linecache.getline(filename, lineno, frame_globals) + if line: + line = line.strip() + else: + line = None + if include_func_start_lineno: + yield (filename, lineno, name, line, func_start_lineno) + else: + yield (filename, lineno, name, line) + + return tuple(_tuple_generator()) From 710d3113bf63558aa8a0faccab9cdb562052692e Mon Sep 17 00:00:00 2001 From: Rick Chao Date: Mon, 22 Jul 2019 13:23:08 -0700 Subject: [PATCH 0329/3053] Unwrap `initial_value` if it is a `CheckpointInitialValue` in collective_all_reduce_strategy's `initial_value_fn`. This fixes a bug where running keras_mnist_multi_worker with eager causes seg fault. 
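For context, the change below simply unwraps the value before it reaches collective_ops.broadcast_send, presumably because the broadcast op expects a plain tensor rather than the CheckpointInitialValue wrapper. A rough standalone sketch of that unwrap-before-use pattern (every name in it is an invented stand-in for illustration, not a TensorFlow API):

    class InitialValueWrapper(object):
        """Stand-in for a wrapper such as CheckpointInitialValue."""

        def __init__(self, wrapped_value):
            self.wrapped_value = wrapped_value

    def broadcast(value):
        """Stand-in for an op that only understands plain values."""
        if isinstance(value, InitialValueWrapper):
            raise TypeError("expected a plain value, got a wrapper")
        return value

    def prepare_initial_value(initial_value):
        # Unwrap first, mirroring the check added in this patch.
        if isinstance(initial_value, InitialValueWrapper):
            initial_value = initial_value.wrapped_value
        return broadcast(initial_value)

    assert prepare_initial_value(InitialValueWrapper(3.0)) == 3.0
    assert prepare_initial_value(3.0) == 3.0
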
PiperOrigin-RevId: 259393313 --- .../python/distribute/collective_all_reduce_strategy.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tensorflow/python/distribute/collective_all_reduce_strategy.py b/tensorflow/python/distribute/collective_all_reduce_strategy.py index e35f95a0331..c43d28b0226 100644 --- a/tensorflow/python/distribute/collective_all_reduce_strategy.py +++ b/tensorflow/python/distribute/collective_all_reduce_strategy.py @@ -40,6 +40,7 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import collective_ops from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.training.tracking import base as trackable from tensorflow.python.util.tf_export import tf_export @@ -335,6 +336,11 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended): if self._num_workers > 1: if self._is_chief: + # Unwrap `initial_value` if it is a `CheckpointInitialValue`. + # TODO(b/138130844): Revert the following check once + # `CheckpointInitialValue` class is removed. + if isinstance(initial_value, trackable.CheckpointInitialValue): + initial_value = initial_value.wrapped_value bcast_send = collective_ops.broadcast_send( initial_value, initial_value.shape, initial_value.dtype, group_size, group_key, collective_instance_key) From bf7368f7a02db5055de09be13ac3ba0143749598 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 22 Jul 2019 13:25:39 -0700 Subject: [PATCH 0330/3053] Return eager tensors from the training_eager.* methods instead of numpy scalars. This also moves the conversion to numpy() to the end of the dist strat strategy execution function in the v2 loops. PiperOrigin-RevId: 259393774 --- tensorflow/python/keras/engine/training.py | 18 ++++++++++++++++-- .../python/keras/engine/training_eager.py | 9 ++------- .../python/keras/engine/training_v2_utils.py | 19 +++++++++---------- 3 files changed, 27 insertions(+), 19 deletions(-) diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index cdc06daae6a..718f3a582cf 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -946,9 +946,14 @@ class Model(network.Network): ValueError: In case of invalid user-provided arguments. """ if self._run_distributed: - return training_v2_utils.train_on_batch( + outputs = training_v2_utils.train_on_batch( self, x, y=y, sample_weight=sample_weight, class_weight=class_weight, reset_metrics=reset_metrics) + outputs = [ + training_v2_utils._non_none_constant_value(v) for v in outputs] # pylint: disable=protected-access + if len(outputs) == 1: + outputs = outputs[0] + return outputs self._assert_compile_was_called() # If at this point we are in the replica context, then it is okay to execute @@ -974,6 +979,8 @@ class Model(network.Network): y, sample_weights=sample_weights, output_loss_metrics=self._output_loss_metrics) + outputs = [ + training_v2_utils._non_none_constant_value(v) for v in outputs] # pylint: disable=protected-access else: x = training_utils.ModelInputs(x).as_list() ins = x + (y or []) + (sample_weights or []) @@ -1031,9 +1038,14 @@ class Model(network.Network): ValueError: In case of invalid user-provided arguments. 
""" if self._run_distributed: - return training_v2_utils.test_on_batch( + outputs = training_v2_utils.test_on_batch( self, x, y=y, sample_weight=sample_weight, reset_metrics=reset_metrics) + outputs = [ + training_v2_utils._non_none_constant_value(v) for v in outputs] # pylint: disable=protected-access + if len(outputs) == 1: + outputs = outputs[0] + return outputs self._assert_compile_was_called() if (self._distribution_strategy and @@ -1053,6 +1065,8 @@ class Model(network.Network): y, sample_weights=sample_weights, output_loss_metrics=self._output_loss_metrics) + outputs = [ + training_v2_utils._non_none_constant_value(v) for v in outputs] # pylint: disable=protected-access else: x = training_utils.ModelInputs(x).as_list() inputs = x + (y or []) + (sample_weights or []) diff --git a/tensorflow/python/keras/engine/training_eager.py b/tensorflow/python/keras/engine/training_eager.py index 6cbc6851a8e..c019238f48e 100644 --- a/tensorflow/python/keras/engine/training_eager.py +++ b/tensorflow/python/keras/engine/training_eager.py @@ -307,12 +307,7 @@ def train_on_batch(model, total_loss = nest.flatten(total_loss) results = total_loss + output_losses + metrics_results - return [_non_none_constant_value(v) for v in results] - - -def _non_none_constant_value(v): - constant_value = tensor_util.constant_value(v) - return constant_value if constant_value is not None else v + return results def test_on_batch(model, @@ -365,4 +360,4 @@ def test_on_batch(model, total_loss = nest.flatten(total_loss) results = total_loss + output_losses + metrics_results - return [_non_none_constant_value(v) for v in results] + return results diff --git a/tensorflow/python/keras/engine/training_v2_utils.py b/tensorflow/python/keras/engine/training_v2_utils.py index 982ef2a71a1..e609559e5e8 100644 --- a/tensorflow/python/keras/engine/training_v2_utils.py +++ b/tensorflow/python/keras/engine/training_v2_utils.py @@ -70,19 +70,22 @@ def _make_execution_function(model, mode): strategy, outputs, with_loss_tensor=(mode != ModeKeys.PREDICT)) return all_outputs - if model.run_eagerly: - execution_function = distributed_function - else: + if not model.run_eagerly: distributed_function = def_function.function( distributed_function, autograph=False) - def execution_function(input_fn): - # `numpy` translates Tensors to values in Eager mode. - return [out.numpy() for out in distributed_function(input_fn)] + def execution_function(input_fn): + # `numpy` translates Tensors to values in Eager mode. + return [out.numpy() for out in distributed_function(input_fn)] return execution_function +def _non_none_constant_value(v): + constant_value = tensor_util.constant_value(v) + return constant_value if constant_value is not None else v + + def _prepare_feed_values(model, inputs, mode): """Prepare feed values to the model execution function. @@ -232,8 +235,6 @@ def train_on_batch( if reset_metrics: model.reset_metrics() - if len(outputs) == 1: - return outputs[0] return outputs @@ -295,8 +296,6 @@ def test_on_batch(model, x, y=None, sample_weight=None, reset_metrics=True): if reset_metrics: model.reset_metrics() - if len(outputs) == 1: - return outputs[0] return outputs From df3fd29cfbb7e5bd655834e0162a804f9d5a5f19 Mon Sep 17 00:00:00 2001 From: Anna R Date: Mon, 22 Jul 2019 13:31:57 -0700 Subject: [PATCH 0331/3053] Switch tf_upgrade_v2 target to python_version = "PY3" instead of python_version = "PY2". 
PiperOrigin-RevId: 259395034 --- tensorflow/tools/compatibility/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/tools/compatibility/BUILD b/tensorflow/tools/compatibility/BUILD index c4fc1a993df..36efc6bf695 100644 --- a/tensorflow/tools/compatibility/BUILD +++ b/tensorflow/tools/compatibility/BUILD @@ -138,7 +138,7 @@ py_binary( name = "tf_upgrade_v2", srcs = ["tf_upgrade_v2_main.py"], main = "tf_upgrade_v2_main.py", - python_version = "PY2", + python_version = "PY3", srcs_version = "PY2AND3", deps = [ ":ast_edits", From d816bfcd707751f59672be2045471f4e662f849f Mon Sep 17 00:00:00 2001 From: Karthik Muthuraman Date: Mon, 22 Jul 2019 13:49:08 -0700 Subject: [PATCH 0332/3053] Regen API golden for reciprocal_no_nan(). --- tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt | 4 ++++ tensorflow/tools/api/golden/v1/tensorflow.pbtxt | 4 ++++ tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt | 4 ++++ tensorflow/tools/api/golden/v2/tensorflow.pbtxt | 4 ++++ 4 files changed, 16 insertions(+) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt index 1fd765a5f81..2180cd87cc2 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt @@ -308,6 +308,10 @@ tf_module { name: "reciprocal" argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "reciprocal_no_nan" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } member_method { name: "reduce_all" argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], " diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt index 178daad4a2a..294efc75ed3 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt @@ -1856,6 +1856,10 @@ tf_module { name: "reciprocal" argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "reciprocal_no_nan" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } member_method { name: "recompute_grad" argspec: "args=[\'f\'], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt index 3ec5c656b3f..5c9ba42b801 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt @@ -308,6 +308,10 @@ tf_module { name: "reciprocal" argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "reciprocal_no_nan" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } member_method { name: "reduce_all" argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt index 33c4610d97b..a56e7d0dbe9 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt @@ -860,6 +860,10 @@ tf_module { name: "realdiv" argspec: "args=[\'x\', \'y\', \'name\'], 
varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "reciprocal_no_nan" + argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } member_method { name: "recompute_grad" argspec: "args=[\'f\'], varargs=None, keywords=None, defaults=None" From 3c5fb53765056f1c83544ea3633a8343ab55224d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 22 Jul 2019 13:36:50 -0700 Subject: [PATCH 0333/3053] Enable disabled test. PiperOrigin-RevId: 259396122 --- tensorflow/python/distribute/BUILD | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index 79d3b126806..91edc480673 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -1123,7 +1123,6 @@ distribute_py_test( tags = [ "no_oss", # TODO(b/135287893) reenable "no_rocm", - "notap", # TODO(b/137972256) Re-enable this test. ], deps = [ ":saved_model_test_base", From 7cc180f107f142432358ac33787466de90afd776 Mon Sep 17 00:00:00 2001 From: Thomas O'Malley Date: Mon, 22 Jul 2019 13:46:02 -0700 Subject: [PATCH 0334/3053] Fix JSON serialization error in TensorFlowOpLayer in Python 3. PiperOrigin-RevId: 259397921 --- tensorflow/python/keras/engine/base_layer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index 5663ff16745..c26bf5b79f3 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -2387,6 +2387,8 @@ class TensorFlowOpLayer(Layer): dtype=None): super(TensorFlowOpLayer, self).__init__( name=_TF_OP_LAYER_NAME_PREFIX + name, trainable=trainable, dtype=dtype) + if not isinstance(node_def, bytes): + node_def = node_def.encode('utf-8') self.node_def = node_def_pb2.NodeDef.FromString(node_def) self.constants = constants or {} # Layer uses original op unless it is called on new inputs. @@ -2446,7 +2448,7 @@ class TensorFlowOpLayer(Layer): def get_config(self): config = super(TensorFlowOpLayer, self).get_config() config.update({ - 'node_def': self.node_def.SerializeToString(), + 'node_def': self.node_def.SerializeToString().decode('utf-8'), 'constants': { i: backend.get_value(c) for i, c in self.constants.items() } From e5b12c6ce34335ff386101548323cb4801f04296 Mon Sep 17 00:00:00 2001 From: Pavithra Vijay Date: Mon, 22 Jul 2019 13:48:23 -0700 Subject: [PATCH 0335/3053] Fix invalid `steps` argument usage test for single execution path. In multiple execution path code, in eager mode we raised an error but otherwise we just raised a warning message. Updated the test case to check for a warning message for all use cases in single execution path. 
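Because the single execution path warns instead of raising for array inputs combined with `steps`, the updated test asserts on the logged warning rather than on an exception. A self-contained sketch of that assertion pattern using only the standard library (the `fit` stub below is invented for illustration and is not the Keras implementation):

    import logging
    import unittest
    from unittest import mock

    def fit(steps_per_epoch=None):
        # Stand-in for a code path that warns instead of raising.
        if steps_per_epoch is not None:
            logging.warning('When passing input data as arrays, do not '
                            'specify steps_per_epoch.')

    class StepsArgumentWarningTest(unittest.TestCase):

        def test_steps_triggers_warning(self):
            with mock.patch.object(logging, 'warning') as mock_log:
                fit(steps_per_epoch=4)
            # call_args survives after the patch is undone.
            self.assertRegex(str(mock_log.call_args), 'do not specify')

    if __name__ == '__main__':
        unittest.main()

Patching logging.warning keeps the assertion independent of how the logging backend formats its output.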
PiperOrigin-RevId: 259398447 --- tensorflow/python/keras/engine/training_test.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py index 9f020221322..aeec0264b92 100644 --- a/tensorflow/python/keras/engine/training_test.py +++ b/tensorflow/python/keras/engine/training_test.py @@ -1412,7 +1412,7 @@ class TestExceptionsAndWarnings(keras_parameterized.TestCase): run_distributed=testing_utils.should_run_distributed()) err_msg = 'When passing input data as arrays, do not specify' - if testing_utils.should_run_eagerly(): + if testing_utils.should_run_eagerly() and not model._run_distributed: with self.assertRaisesRegex(ValueError, err_msg): model.fit(x=np.zeros((100, 1)), y=np.ones((100, 1)), steps_per_epoch=4) @@ -1423,15 +1423,12 @@ class TestExceptionsAndWarnings(keras_parameterized.TestCase): model.predict(np.zeros((100, 1)), steps=4) else: with test.mock.patch.object(logging, 'warning') as mock_log: - model.fit(x=np.zeros((100, 1)), y=np.ones((100, 1)), steps_per_epoch=4) - self.assertRegexpMatches(str(mock_log.call_args), err_msg) - - with test.mock.patch.object(logging, 'warning') as mock_log: - model.evaluate(x=np.zeros((100, 1)), y=np.ones((100, 1)), steps=4) - self.assertRegexpMatches(str(mock_log.call_args), err_msg) - - with test.mock.patch.object(logging, 'warning') as mock_log: - model.predict(np.zeros((100, 1)), steps=4) + model._standardize_user_data( + np.zeros((100, 1)), + np.ones((100, 1)), + batch_size=25, + check_steps=True, + steps=4) self.assertRegexpMatches(str(mock_log.call_args), err_msg) From e547d262a54e075d8728454089404d36b58eb3a8 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Mon, 22 Jul 2019 14:14:08 -0700 Subject: [PATCH 0336/3053] [XLA:CPU] When emitting an elemental F16 conv, do the accumulation in F32 This matches what cuBlas or Eigen are doing and gives better precision for F16 convolutions. PiperOrigin-RevId: 259403856 --- tensorflow/compiler/xla/service/cpu/ir_emitter.cc | 11 +++++++---- tensorflow/compiler/xla/tests/convolution_test.cc | 4 ---- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index 1509da6f7ec..ceaeacbea2a 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -1027,10 +1027,13 @@ StatusOr IrEmitter::EmitElementalConvolution( PrimitiveType lhs_element_type = lhs->shape().element_type(); llvm::Type* lhs_llvm_type = llvm_ir::PrimitiveTypeToIrType(lhs_element_type, module_); + // Upcast the accumulator to F32 from F16 for increased precision. + llvm::Type* accumulator_type = + lhs_element_type == F16 ? 
b_.getFloatTy() : lhs_llvm_type; llvm::Value* sum_address = llvm_ir::EmitAllocaAtFunctionEntry( - lhs_llvm_type, "convolution_sum_address", &b_, + accumulator_type, "convolution_sum_address", &b_, MinimumAlignmentForPrimitiveType(lhs_element_type)); - llvm::Value* constant_zero = llvm::Constant::getNullValue(lhs_llvm_type); + llvm::Value* constant_zero = llvm::Constant::getNullValue(accumulator_type); Store(constant_zero, sum_address); llvm_ir::ForLoopNest loops(IrName(convolution, "inner"), &b_); @@ -1139,11 +1142,11 @@ StatusOr IrEmitter::EmitElementalConvolution( TF_ASSIGN_OR_RETURN(llvm::Value* const kernel_value, kernel_generator(kernel_index)); llvm::Value* product = FMul(input_value, kernel_value); - llvm::Value* sum = FAdd(Load(sum_address), product); + llvm::Value* sum = FAdd(Load(sum_address), FPCast(product, accumulator_type)); Store(sum, sum_address); SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_); - return Load(sum_address); + return FPCast(Load(sum_address), lhs_llvm_type); } Status IrEmitter::HandleConvolution(HloInstruction* convolution) { diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc index 0ab765aefa0..0fae5d966db 100644 --- a/tensorflow/compiler/xla/tests/convolution_test.cc +++ b/tensorflow/compiler/xla/tests/convolution_test.cc @@ -1842,15 +1842,11 @@ INSTANTIATE_TEST_CASE_P( Convolve1DTestParam{130, 1, 1, 1, 3}, Convolve1DTestParam{64, 1, 1, 1, 1}, Convolve1DTestParam{128, 1, 1, 1, 1}, -// TODO(b/72566306): The following five tests failed on CPU with unreasonable -// relative errors. Last ran on 2018-02-22. -#if XLA_TEST_BACKEND_GPU Convolve1DTestParam{139, 1, 1, 128, 1}, Convolve1DTestParam{640, 3, 3, 128, 1}, Convolve1DTestParam{900, 1, 1, 10, 1}, Convolve1DTestParam{1, 10, 10, 1, 10}, Convolve1DTestParam{1, 10, 130, 1, 1}, -#endif Convolve1DTestParam{1, 10, 130, 1, 2}, Convolve1DTestParam{1, 64, 64, 1, 10}, Convolve1DTestParam{1, 65, 65, 1, 1}, From 200c062c11968feb42a27a55c0a48940f02095ea Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Mon, 22 Jul 2019 14:20:04 -0700 Subject: [PATCH 0337/3053] Address the comments --- .../auto_shard_dataset_op_test.cc | 219 +++++++++--------- 1 file changed, 110 insertions(+), 109 deletions(-) diff --git a/tensorflow/core/kernels/data/experimental/auto_shard_dataset_op_test.cc b/tensorflow/core/kernels/data/experimental/auto_shard_dataset_op_test.cc index 33546416e56..828561a86de 100644 --- a/tensorflow/core/kernels/data/experimental/auto_shard_dataset_op_test.cc +++ b/tensorflow/core/kernels/data/experimental/auto_shard_dataset_op_test.cc @@ -49,14 +49,29 @@ class AutoShardDatasetOpTest : public DatasetOpsTestBase { } }; -struct RangeDatasetParams { - int64 start; - int64 stop; - int64 step; -}; - struct TestCase { - RangeDatasetParams range_dataset_param; + TestCase(int64 start, int64 stop, int64 step, int64 num_workers, int64 index, + std::vector expected_outputs, + DataTypeVector expected_output_dtypes, + std::vector expected_output_shapes, + int64 expected_cardinality, std::vector breakpoints) + : start( + DatasetOpsTestBase::CreateTensor(TensorShape({}), {start})), + stop(DatasetOpsTestBase::CreateTensor(TensorShape({}), {stop})), + step(DatasetOpsTestBase::CreateTensor(TensorShape({}), {step})), + num_workers(DatasetOpsTestBase::CreateTensor(TensorShape({}), + {num_workers})), + index( + DatasetOpsTestBase::CreateTensor(TensorShape({}), {index})), + expected_outputs(std::move(expected_outputs)), + 
expected_output_dtypes(std::move(expected_output_dtypes)), + expected_output_shapes(std::move(expected_output_shapes)), + expected_cardinality(expected_cardinality), + breakpoints(std::move(breakpoints)) {} + + Tensor start; + Tensor stop; + Tensor step; Tensor num_workers; Tensor index; std::vector expected_outputs; @@ -67,105 +82,105 @@ struct TestCase { }; // Test Case 1: simple case. -TestCase TestCase1() { - return {/*range_data_param*/ {0, 10, 1}, - /*num_workers*/ - DatasetOpsTestBase::CreateTensor(TensorShape({}), {5}), - /*index*/ - DatasetOpsTestBase::CreateTensor(TensorShape({}), {2}), - /*expected_outputs*/ +TestCase SimpleCase() { + return {/*start=*/0, + /*stop=*/10, + /*step=*/1, + /*num_workers=*/5, + /*index=*/2, + /*expected_outputs=*/ {DatasetOpsTestBase::CreateTensor(TensorShape({}), {2}), DatasetOpsTestBase::CreateTensor(TensorShape({}), {7})}, - /*expected_output_dtypes*/ {DT_INT64}, - /*expected_output_shapes*/ {PartialTensorShape({})}, - /*expected_cardinality*/ 2, - /*breakpoints*/ {0, 1, 5}}; + /*expected_output_dtypes=*/{DT_INT64}, + /*expected_output_shapes=*/{PartialTensorShape({})}, + /*expected_cardinality=*/2, + /*breakpoints=*/{0, 1, 5}}; } // Test Case 2: the index is larger than the available elements. -TestCase TestCase2() { - return {/*range_data_param*/ {0, 1, 1}, - /*num_workers*/ - DatasetOpsTestBase::CreateTensor(TensorShape({}), {5}), - /*index*/ - DatasetOpsTestBase::CreateTensor(TensorShape({}), {2}), - /*expected_outputs*/ {}, - /*expected_output_dtypes*/ {DT_INT64}, - /*expected_output_shapes*/ {PartialTensorShape({})}, - /*expected_cardinality*/ 0, - /*breakpoints*/ {0, 1}}; +TestCase IndexLargerThanAvailableElementsCase() { + return {/*start=*/0, + /*stop=*/1, + /*step=*/1, + /*num_workers=*/5, + /*index=*/2, + /*expected_outputs=*/{}, + /*expected_output_dtypes=*/{DT_INT64}, + /*expected_output_shapes=*/{PartialTensorShape({})}, + /*expected_cardinality=*/2, + /*breakpoints=*/{0, 1}}; } // Test Case 3: the number of outputs could not be evenly divided by // num_workers. -TestCase TestCase3() { - return {/*range_data_param*/ {0, 10, 1}, - /*num_workers*/ - DatasetOpsTestBase::CreateTensor(TensorShape({}), {4}), - /*index*/ - DatasetOpsTestBase::CreateTensor(TensorShape({}), {3}), - /*expected_outputs*/ +TestCase ElementsUnequallyDividedCase() { + return {/*start=*/0, + /*stop=*/10, + /*step=*/1, + /*num_workers=*/4, + /*index=*/3, + /*expected_outputs=*/ {DatasetOpsTestBase::CreateTensor(TensorShape({}), {3}), DatasetOpsTestBase::CreateTensor(TensorShape({}), {7})}, - /*expected_output_dtypes*/ {DT_INT64}, - /*expected_output_shapes*/ {PartialTensorShape({})}, - /*expected_cardinality*/ 2, - /*breakpoints*/ {0, 1, 5}}; + /*expected_output_dtypes=*/{DT_INT64}, + /*expected_output_shapes=*/{PartialTensorShape({})}, + /*expected_cardinality=*/2, + /*breakpoints=*/{0, 1, 5}}; } // TODO(feihugis): add more test cases that have ReaderDatasets (e.g. a // CSVDataset or a TFRecordDataset) in the pipeline. 
TestCase IndexGreaterNumWorkersCase() { - return {/*range_data_param*/ {0, 10, 1}, - /*num_workers*/ - DatasetOpsTestBase::CreateTensor(TensorShape({}), {5}), - /*index*/ - DatasetOpsTestBase::CreateTensor(TensorShape({}), {7}), - /*expected_outputs*/ {}, - /*expected_output_dtypes*/ {DT_INT64}, - /*expected_output_shapes*/ {PartialTensorShape({})}, - /*expected_cardinality*/ 0, - /*breakpoints*/ {}}; + return {/*start=*/0, + /*stop=*/10, + /*step=*/1, + /*num_workers=*/5, + /*index=*/7, + /*expected_outputs=*/{}, + /*expected_output_dtypes=*/{DT_INT64}, + /*expected_output_shapes=*/{PartialTensorShape({})}, + /*expected_cardinality=*/0, + /*breakpoints=*/{}}; } TestCase NegativeIndexTestCase() { - return {/*range_data_param*/ {0, 10, 1}, - /*num_workers*/ - DatasetOpsTestBase::CreateTensor(TensorShape({}), {5}), - /*index*/ - DatasetOpsTestBase::CreateTensor(TensorShape({}), {-3}), - /*expected_outputs*/ {}, - /*expected_output_dtypes*/ {DT_INT64}, - /*expected_output_shapes*/ {PartialTensorShape({})}, - /*expected_cardinality*/ 0, - /*breakpoints*/ {}}; + return {/*start=*/0, + /*stop=*/10, + /*step=*/1, + /*num_workers=*/5, + /*index=*/-3, + /*expected_outputs=*/{}, + /*expected_output_dtypes=*/{DT_INT64}, + /*expected_output_shapes=*/{PartialTensorShape({})}, + /*expected_cardinality=*/0, + /*breakpoints=*/{}}; } TestCase NegativeNumWorkersTestCase() { - return {/*range_data_param*/ {0, 10, 1}, - /*num_workers*/ - DatasetOpsTestBase::CreateTensor(TensorShape({}), {-3}), - /*index*/ - DatasetOpsTestBase::CreateTensor(TensorShape({}), {1}), - /*expected_outputs*/ {}, - /*expected_output_dtypes*/ {DT_INT64}, - /*expected_output_shapes*/ {PartialTensorShape({})}, - /*expected_cardinality*/ 0, - /*breakpoints*/ {}}; + return {/*start=*/0, + /*stop=*/10, + /*step=*/1, + /*num_workers=*/-3, + /*index=*/1, + /*expected_outputs=*/{}, + /*expected_output_dtypes=*/{DT_INT64}, + /*expected_output_shapes=*/{PartialTensorShape({})}, + /*expected_cardinality=*/0, + /*breakpoints=*/{}}; } TestCase ZeroNumWorkersTestCase() { - return {/*range_data_param*/ {0, 10, 1}, - /*num_workers*/ - DatasetOpsTestBase::CreateTensor(TensorShape({}), {0}), - /*index*/ - DatasetOpsTestBase::CreateTensor(TensorShape({}), {1}), - /*expected_outputs*/ {}, - /*expected_output_dtypes*/ {DT_INT64}, - /*expected_output_shapes*/ {PartialTensorShape({})}, - /*expected_cardinality*/ 0, - /*breakpoints*/ {}}; + return {/*start=*/0, + /*stop=*/10, + /*step=*/1, + /*num_workers=*/0, + /*index=*/1, + /*expected_outputs=*/{}, + /*expected_output_dtypes=*/{DT_INT64}, + /*expected_output_shapes=*/{PartialTensorShape({})}, + /*expected_cardinality=*/0, + /*breakpoints=*/{}}; } class ParameterizedAutoShardDatasetOpTest @@ -183,21 +198,13 @@ TEST_P(ParameterizedAutoShardDatasetOpTest, GetNext) { test_case.expected_output_shapes, &auto_shard_dataset_kernel)); - Tensor start = CreateTensor(TensorShape({}), - {test_case.range_dataset_param.start}); - Tensor stop = CreateTensor(TensorShape({}), - {test_case.range_dataset_param.stop}); - Tensor step = CreateTensor(TensorShape({}), - {test_case.range_dataset_param.step}); Tensor range_dataset_tensor(DT_VARIANT, TensorShape({})); - TF_ASSERT_OK(MakeRangeDataset(start, stop, step, {DT_INT64}, - {TensorShape({})}, &range_dataset_tensor)); - - Tensor num_workers = test_case.num_workers; - Tensor index = test_case.index; - gtl::InlinedVector inputs({TensorValue(&range_dataset_tensor), - TensorValue(&num_workers), - TensorValue(&index)}); + TF_ASSERT_OK(MakeRangeDataset(test_case.start, 
test_case.stop, test_case.step, + {DT_INT64}, {TensorShape({})}, + &range_dataset_tensor)); + gtl::InlinedVector inputs( + {TensorValue(&range_dataset_tensor), TensorValue(&test_case.num_workers), + TensorValue(&test_case.index)}); std::unique_ptr auto_shard_dataset_context; TF_ASSERT_OK(CreateAutoShardDatasetContext( auto_shard_dataset_kernel.get(), &inputs, &auto_shard_dataset_context)); @@ -233,7 +240,9 @@ TEST_P(ParameterizedAutoShardDatasetOpTest, GetNext) { INSTANTIATE_TEST_SUITE_P(AutoShardDatasetOpTest, ParameterizedAutoShardDatasetOpTest, ::testing::ValuesIn(std::vector( - {TestCase1(), TestCase2(), TestCase3()}))); + {SimpleCase(), + IndexLargerThanAvailableElementsCase(), + ElementsUnequallyDividedCase()}))); TEST_F(AutoShardDatasetOpTest, InvalidArguments) { int thread_num = 2, cpu_num = 2; @@ -243,27 +252,19 @@ TEST_F(AutoShardDatasetOpTest, InvalidArguments) { std::vector test_cases = { IndexGreaterNumWorkersCase(), NegativeIndexTestCase(), NegativeNumWorkersTestCase(), ZeroNumWorkersTestCase()}; - for (const auto& test_case : test_cases) { + for (auto& test_case : test_cases) { std::unique_ptr auto_shard_dataset_kernel; TF_ASSERT_OK(CreateAutoShardDatasetOpKernel( test_case.expected_output_dtypes, test_case.expected_output_shapes, &auto_shard_dataset_kernel)); - Tensor start = CreateTensor(TensorShape({}), - {test_case.range_dataset_param.start}); - Tensor stop = CreateTensor(TensorShape({}), - {test_case.range_dataset_param.stop}); - Tensor step = CreateTensor(TensorShape({}), - {test_case.range_dataset_param.step}); Tensor range_dataset_tensor(DT_VARIANT, TensorShape({})); - TF_ASSERT_OK(MakeRangeDataset(start, stop, step, {DT_INT64}, - {TensorShape({})}, &range_dataset_tensor)); - - Tensor num_workers = test_case.num_workers; - Tensor index = test_case.index; + TF_ASSERT_OK(MakeRangeDataset(test_case.start, test_case.stop, + test_case.step, {DT_INT64}, {TensorShape({})}, + &range_dataset_tensor)); gtl::InlinedVector inputs( - {TensorValue(&range_dataset_tensor), TensorValue(&num_workers), - TensorValue(&index)}); + {TensorValue(&range_dataset_tensor), + TensorValue(&test_case.num_workers), TensorValue(&test_case.index)}); std::unique_ptr auto_shard_dataset_context; TF_ASSERT_OK(CreateAutoShardDatasetContext( auto_shard_dataset_kernel.get(), &inputs, &auto_shard_dataset_context)); From f8c912d2b70280752563f6bbbf626c5e5ea72b6a Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Mon, 22 Jul 2019 14:14:12 -0700 Subject: [PATCH 0338/3053] Add colocation back for optimizer v2. PiperOrigin-RevId: 259403870 --- tensorflow/python/keras/optimizer_v2/optimizer_v2.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py index 039e2b4cea7..f053d856dd3 100644 --- a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py +++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py @@ -474,9 +474,12 @@ class OptimizerV2(trackable.Trackable): update_ops = [] with backend.name_scope(name or self._name): for grad, var in grads_and_vars: - scope_name = ("" if ops.executing_eagerly_outside_functions() else - "_" + var.op.name) - with backend.name_scope("update" + scope_name): + scope_name = ("update" if ops.executing_eagerly_outside_functions() else + "update_" + var.op.name) + # Colocate the update with variables to avoid unnecessary communication + # delays. See b/136304694. 
+ with backend.name_scope( + scope_name), distribution.extended.colocate_vars_with(var): update_ops.extend( distribution.extended.update( var, apply_grad_to_update_var, args=(grad,), group=False)) From 1d29d5d79f60620527ca0ba34e75dc8b1018a95b Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Mon, 22 Jul 2019 14:31:42 -0700 Subject: [PATCH 0339/3053] [TF:XLA] Bump open source llvm revision to r366675 PiperOrigin-RevId: 259407441 --- tensorflow/workspace.bzl | 8 ++++---- third_party/llvm/llvm.autogenerated.BUILD | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 0303a49982d..1cfe0a2b689 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -543,11 +543,11 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "llvm", build_file = clean_dep("//third_party/llvm:llvm.autogenerated.BUILD"), - sha256 = "9257e111ae3d5b9d80925ef1329666440460abf4d052e701fa587f5236be6fcc", - strip_prefix = "llvm-df22a5e50a3d36a7b68eea106970dfa5df6d2453", + sha256 = "88012afcd6d8238430d39967b62e5599bc31d9c4cdc6d20281bedf1020b7000b", + strip_prefix = "llvm-b7d166cebcf619a3691eed3f994384aab3d80fa6", urls = [ - "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/df22a5e50a3d36a7b68eea106970dfa5df6d2453.tar.gz", - "https://github.com/llvm-mirror/llvm/archive/df22a5e50a3d36a7b68eea106970dfa5df6d2453.tar.gz", + "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/b7d166cebcf619a3691eed3f994384aab3d80fa6.tar.gz", + "https://github.com/llvm-mirror/llvm/archive/b7d166cebcf619a3691eed3f994384aab3d80fa6.tar.gz", ], ) diff --git a/third_party/llvm/llvm.autogenerated.BUILD b/third_party/llvm/llvm.autogenerated.BUILD index 2eb65ae68b5..400326276e8 100644 --- a/third_party/llvm/llvm.autogenerated.BUILD +++ b/third_party/llvm/llvm.autogenerated.BUILD @@ -472,6 +472,7 @@ cc_library( ":selection_dag", ":support", ":target", + ":transform_utils", ], ) From 7809c78de61774bdc9a5c8d61aea757934469339 Mon Sep 17 00:00:00 2001 From: Rajeshwar Reddy T <43972606+rthadur@users.noreply.github.com> Date: Mon, 22 Jul 2019 14:50:31 -0700 Subject: [PATCH 0340/3053] Update CONTRIBUTING.md --- CONTRIBUTING.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 4ed8a8bf2b2..f6b81afa59d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -29,7 +29,7 @@ Follow either of the two links above to access the appropriate CLA and instructi ### Contributing code If you have improvements to TensorFlow, send us your pull requests! For those -just getting started, Github has a [howto](https://help.github.com/articles/using-pull-requests/). +just getting started, Github has a [how to](https://help.github.com/articles/using-pull-requests/). TensorFlow team members will be assigned to review your pull requests. Once the pull requests are approved and pass continuous integration checks, a TensorFlow From ed2d1fe63dfb3a4ddd619d1a27acfd48c6408464 Mon Sep 17 00:00:00 2001 From: Sundeep Gottipati Date: Mon, 22 Jul 2019 14:41:43 -0700 Subject: [PATCH 0341/3053] Implement __gt__ method on FeatureColumn base class so that they are sortable in Python 3. 
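Why `__lt__` alone is not enough here: in Python 3, when a plain string ends up on the left-hand side of `<` during a sort, str.__lt__ returns NotImplemented and CPython falls back to the reflected `__gt__` on the feature column; without it the mixed sort raises TypeError (Python 2 silently ordered mixed types). A minimal standalone illustration of the mechanism (SortableByRepr is an invented stand-in, not a TensorFlow class):

    class SortableByRepr(object):
        """Orders instances by their string representation."""

        def __lt__(self, other):
            return str(self) < str(other)

        def __gt__(self, other):
            # Reached as the reflected operation when e.g. "'d' < obj" is
            # evaluated and str.__lt__ returns NotImplemented.
            return str(self) > str(other)

    a, b = SortableByRepr(), SortableByRepr()
    # Succeeds only because __gt__ handles the reflected comparisons;
    # with __lt__ alone this sort raises TypeError in Python 3.
    print(sorted(['d', b, a, '0']))
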
PiperOrigin-RevId: 259409662
---
 .../python/feature_column/feature_column.py   | 29 +++++++++++++++--
 .../feature_column/feature_column_v2.py       | 31 +++++++++++++++++--
 .../feature_column/feature_column_v2_test.py  |  2 +-
 3 files changed, 55 insertions(+), 7 deletions(-)

diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index 640561f4995..7445556d421 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -1758,12 +1758,15 @@ class _FeatureColumn(object):
     pass
 
   def __lt__(self, other):
-    """Allows feature columns to be sortable in Python 3 as they are in 2.
+    """Allows feature columns to be sorted in Python 3 as they are in Python 2.
 
     Feature columns need to occasionally be sortable, for example when used as
     keys in a features dictionary passed to a layer.
 
-    `__lt__` is the only method needed for sorting in CPython:
+    In CPython, `__lt__` must be defined for all objects in the
+    sequence being sorted. If any objects do not have an `__lt__` compatible
+    with feature column objects (such as strings), then CPython will fall back
+    to using the `__gt__` method below.
     https://docs.python.org/3/library/stdtypes.html#list.sort
 
     Args:
@@ -1772,10 +1775,30 @@ class _FeatureColumn(object):
     Returns:
       True if the string representation of this object is lexicographically
       less than the string representation of `other`. For FeatureColumn objects,
-      this looks like "<__main__.FeatureColumn object at 0x7fa1fc02bba8>".
+      this looks like "<__main__.FeatureColumn object at 0xa>".
     """
     return str(self) < str(other)
 
+  def __gt__(self, other):
+    """Allows feature columns to be sorted in Python 3 as they are in Python 2.
+
+    Feature columns need to occasionally be sortable, for example when used as
+    keys in a features dictionary passed to a layer.
+
+    `__gt__` is called when the "other" object being compared during the sort
+    does not have `__lt__` defined.
+    Example: http://gpaste/4803354716798976
+
+    Args:
+      other: The other object to compare to.
+
+    Returns:
+      True if the string representation of this object is lexicographically
+      greater than the string representation of `other`. For FeatureColumn
+      objects, this looks like "<__main__.FeatureColumn object at 0xa>".
+    """
+    return str(self) > str(other)
+
   @property
   def _var_scope_name(self):
     """Returns string. Used for variable_scope. Defaults to self.name."""
diff --git a/tensorflow/python/feature_column/feature_column_v2.py b/tensorflow/python/feature_column/feature_column_v2.py
index 96a08141076..d232565a6b3 100644
--- a/tensorflow/python/feature_column/feature_column_v2.py
+++ b/tensorflow/python/feature_column/feature_column_v2.py
@@ -2198,12 +2198,17 @@ class FeatureColumn(object):
     pass
 
   def __lt__(self, other):
-    """Allows feature columns to be sortable in Python 3 as they are in 2.
+    """Allows feature columns to be sorted in Python 3 as they are in Python 2.
 
     Feature columns need to occasionally be sortable, for example when used as
     keys in a features dictionary passed to a layer.
 
-    `__lt__` is the only method needed for sorting in CPython:
+    In CPython, `__lt__` must be defined for all objects in the
+    sequence being sorted.
+
+    If any objects in the sequence being sorted do not have an `__lt__` method
+    compatible with feature column objects (such as strings), then CPython will
+    fall back to using the `__gt__` method below.
https://docs.python.org/3/library/stdtypes.html#list.sort Args: @@ -2212,10 +2217,30 @@ class FeatureColumn(object): Returns: True if the string representation of this object is lexicographically less than the string representation of `other`. For FeatureColumn objects, - this looks like "<__main__.FeatureColumn object at 0x7fa1fc02bba8>". + this looks like "<__main__.FeatureColumn object at 0xa>". """ return str(self) < str(other) + def __gt__(self, other): + """Allows feature columns to be sorted in Python 3 as they are in Python 2. + + Feature columns need to occasionally be sortable, for example when used as + keys in a features dictionary passed to a layer. + + `__gt__` is called when the "other" object being compared during the sort + does not have `__lt__` defined. + Example: http://gpaste/4803354716798976 + + Args: + other: The other object to compare to. + + Returns: + True if the string representation of this object is lexicographically + greater than the string representation of `other`. For FeatureColumn + objects, this looks like "<__main__.FeatureColumn object at 0xa>". + """ + return str(self) > str(other) + @abc.abstractmethod def transform_feature(self, transformation_cache, state_manager): """Returns intermediate representation (usually a `Tensor`). diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py index 528f8fec83e..3391badb4e9 100644 --- a/tensorflow/python/feature_column/feature_column_v2_test.py +++ b/tensorflow/python/feature_column/feature_column_v2_test.py @@ -99,7 +99,7 @@ class SortableFeatureColumnTest(test.TestCase): a = fc.numeric_column('first') # '<__main__.NumericColumn object at 0xa>' b = fc.numeric_column('second') # '<__main__.NumericColumn object at 0xb>' c = fc_old._numeric_column('third') # '<__main__._NumericColumn ...>' - self.assertAllEqual(sorted(['d', c, b, a]), [a, b, c, 'd']) + self.assertAllEqual(sorted(['d', c, b, a, '0']), ['0', a, b, c, 'd']) class LazyColumnTest(test.TestCase): From 98f6e0e82b1827cc2b16da16225315d5986094e3 Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Mon, 22 Jul 2019 14:45:42 -0700 Subject: [PATCH 0342/3053] Update broken colabs to follow the latest practices. 
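As a rough standalone sketch of the "latest practices" the notebook below is updated to use (assuming a TF 2.x-style API; condensed from the new Fibonacci cell rather than copied verbatim):

    import tensorflow as tf

    @tf.function
    def fib(n):
        # AutoGraph rewrites this Python loop as a tf.while_loop because the
        # iteration count comes from a tensor (tf.range(n)); no explicit
        # Graph/Session handling is needed, unlike the old ag.to_graph version.
        f1 = tf.constant(0)
        f2 = tf.constant(1)
        for i in tf.range(n):
            f1, f2 = f2, f1 + f2
            tf.print(i, ':', f2)
        return f2

    fib(tf.constant(10))

The same pattern (decorate with `@tf.function`, use `tf.print`/`tf.range` inside) replaces the `ag.to_graph(...)` plus `tf.Session` plumbing throughout the diff.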
PiperOrigin-RevId: 259410575 --- .../examples/notebooks/algorithms.ipynb | 1701 ++++++----------- 1 file changed, 562 insertions(+), 1139 deletions(-) diff --git a/tensorflow/contrib/autograph/examples/notebooks/algorithms.ipynb b/tensorflow/contrib/autograph/examples/notebooks/algorithms.ipynb index bf824e2760e..c51d2124920 100644 --- a/tensorflow/contrib/autograph/examples/notebooks/algorithms.ipynb +++ b/tensorflow/contrib/autograph/examples/notebooks/algorithms.ipynb @@ -18,18 +18,29 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "TuWj26KWz1fZ" }, "outputs": [], "source": [ - "!pip install -U -q tf-nightly" + "!pip install -U -q tf-nightly-2.0-preview" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": {}, + "colab_type": "code", + "id": "Cp7iTarmz62Y" + }, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "\n", + "tf = tf.compat.v2\n", + "tf.enable_v2_behavior()" ] }, { @@ -41,25 +52,21 @@ "source": [ "### Fibonacci numbers\n", "\n", - "https://en.wikipedia.org/wiki/Fibonacci_number" + "https://en.wikipedia.org/wiki/Fibonacci_number\n" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 11, "metadata": { "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - }, - "height": 197 + "height": 187 }, "colab_type": "code", "executionInfo": { - "elapsed": 7512, + "elapsed": 709, "status": "ok", - "timestamp": 1532101577266, + "timestamp": 1563825398552, "user": { "displayName": "", "photoUrl": "", @@ -68,7 +75,7 @@ "user_tz": 240 }, "id": "H7olFlMXqrHe", - "outputId": "472dbfe0-9449-4f93-e908-1a0785188a92" + "outputId": "25243e7b-99a7-4a6d-ad00-e97c52be7d97" }, "outputs": [ { @@ -89,25 +96,19 @@ } ], "source": [ - "import tensorflow as tf\n", - "from tensorflow.contrib import autograph as ag\n", - "\n", - "\n", + "@tf.function\n", "def fib(n):\n", " f1 = 0\n", " f2 = 1\n", - " for i in range(n):\n", + " for i in tf.range(n):\n", " tmp = f2\n", " f2 = f2 + f1\n", " f1 = tmp\n", - " print(i, ': ', f2)\n", + " tf.print(i, ': ', f2)\n", " return f2\n", "\n", "\n", - "with tf.Graph().as_default():\n", - " final_fib = ag.to_graph(fib)(tf.constant(10))\n", - " with tf.Session() as sess:\n", - " sess.run(final_fib)" + "_ = fib(tf.constant(10))" ] }, { @@ -122,68 +123,15 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - }, - "height": 541 - }, + "colab": {}, "colab_type": "code", - "executionInfo": { - "elapsed": 103, - "status": "ok", - "timestamp": 1532101577412, - "user": { - "displayName": "", - "photoUrl": "", - "userId": "" - }, - "user_tz": 240 - }, - "id": "UeWjK8rHq6Cj", - "outputId": "73ece895-12fb-489a-e52c-032945d7ed7a" + "id": "UeWjK8rHq6Cj" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "from __future__ import print_function\n", - "import tensorflow as tf\n", - "\n", - "def tf__fib(n):\n", - " try:\n", - " with tf.name_scope('fib'):\n", - " f1 = 0\n", - " f2 = 1\n", - "\n", - " def extra_test(f1_1, f2_1):\n", - " with tf.name_scope('extra_test'):\n", - " return True\n", - "\n", - " def loop_body(i, f1_1, f2_1):\n", - " with tf.name_scope('loop_body'):\n", - " tmp = f2_1\n", - " f2_1 = f2_1 + f1_1\n", - " f1_1 = tmp\n", - " with ag__.utils.control_dependency_on_returns(ag__.utils.\n", - " dynamic_print(i, ': ', f2_1)):\n", - 
" f2, i_1 = ag__.utils.alias_tensors(f2_1, i)\n", - " return f1_1, f2\n", - " f1, f2 = ag__.for_stmt(ag__.utils.dynamic_builtin(range, n),\n", - " extra_test, loop_body, (f1, f2))\n", - " return f2\n", - " except:\n", - " ag__.rewrite_graph_construction_error(ag_source_map__)\n", - "\n" - ] - } - ], + "outputs": [], "source": [ - "print(ag.to_code(fib))" + "print(tf.autograph.to_code(fib.python_function))" ] }, { @@ -200,20 +148,16 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 12, "metadata": { "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - }, - "height": 125 + "height": 119 }, "colab_type": "code", "executionInfo": { - "elapsed": 233, + "elapsed": 663, "status": "ok", - "timestamp": 1532101577681, + "timestamp": 1563825401385, "user": { "displayName": "", "photoUrl": "", @@ -222,7 +166,7 @@ "user_tz": 240 }, "id": "33CAheYsrEQ7", - "outputId": "82a493ee-15b5-419d-8c9c-5f4159090a05" + "outputId": "2a88b65d-4fed-4d96-8770-0c68ffece861" }, "outputs": [ { @@ -240,8 +184,9 @@ ], "source": [ "import tensorflow as tf\n", - "from tensorflow.contrib import autograph as ag\n", "\n", + "\n", + "@tf.function(experimental_autograph_options=tf.autograph.experimental.Feature.EQUALITY_OPERATORS)\n", "def fizzbuzz(i, n):\n", " while i \u003c n:\n", " msg = ''\n", @@ -251,14 +196,11 @@ " msg += 'Buzz'\n", " if msg == '':\n", " msg = tf.as_string(i)\n", - " print(msg)\n", + " tf.print(msg)\n", " i += 1\n", " return i\n", "\n", - "with tf.Graph().as_default():\n", - " final_i = ag.to_graph(fizzbuzz)(tf.constant(10), tf.constant(16))\n", - " with tf.Session() as sess:\n", - " sess.run(final_i)" + "_ = fizzbuzz(tf.constant(10), tf.constant(16))" ] }, { @@ -273,98 +215,15 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - }, - "height": 1081 - }, + "colab": {}, "colab_type": "code", - "executionInfo": { - "elapsed": 289, - "status": "ok", - "timestamp": 1532101578003, - "user": { - "displayName": "", - "photoUrl": "", - "userId": "" - }, - "user_tz": 240 - }, - "id": "bBhFIIaZrxvx", - "outputId": "d076a7ea-e643-4689-f90a-57f5d086dedc" + "id": "bBhFIIaZrxvx" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "from __future__ import print_function\n", - "import tensorflow as tf\n", - "\n", - "def tf__fizzbuzz(i, n):\n", - " try:\n", - " with tf.name_scope('fizzbuzz'):\n", - "\n", - " def loop_test(i_1):\n", - " with tf.name_scope('loop_test'):\n", - " return tf.less(i_1, n)\n", - "\n", - " def loop_body(i_1):\n", - " with tf.name_scope('loop_body'):\n", - " msg = ''\n", - "\n", - " def if_true():\n", - " with tf.name_scope('if_true'):\n", - " msg_1, = msg,\n", - " msg_1 += 'Fizz'\n", - " return msg_1,\n", - "\n", - " def if_false():\n", - " with tf.name_scope('if_false'):\n", - " return msg,\n", - " msg = ag__.utils.run_cond(tf.equal(i_1 % 3, 0), if_true, if_false)\n", - "\n", - " def if_true_1():\n", - " with tf.name_scope('if_true_1'):\n", - " msg_2, = msg,\n", - " msg_2 += 'Buzz'\n", - " return msg_2,\n", - "\n", - " def if_false_1():\n", - " with tf.name_scope('if_false_1'):\n", - " return msg,\n", - " msg = ag__.utils.run_cond(tf.equal(i_1 % 5, 0), if_true_1, if_false_1\n", - " )\n", - "\n", - " def if_true_2():\n", - " with tf.name_scope('if_true_2'):\n", - " msg_3, = msg,\n", - " msg_3 = tf.as_string(i_1)\n", - " return msg_3,\n", - "\n", - " def if_false_2():\n", - " with tf.name_scope('if_false_2'):\n", - " 
return msg,\n", - " msg = ag__.utils.run_cond(tf.equal(msg, ''), if_true_2, if_false_2)\n", - " with ag__.utils.control_dependency_on_returns(ag__.utils.\n", - " dynamic_print(msg)):\n", - " msg_4 = ag__.utils.alias_tensors(msg)\n", - " i_1 += 1\n", - " return i_1,\n", - " i = ag__.while_stmt(loop_test, loop_body, (i,), (tf, n, ag__, i))\n", - " return i\n", - " except:\n", - " ag__.rewrite_graph_construction_error(ag_source_map__)\n", - "\n" - ] - } - ], + "outputs": [], "source": [ - "print(ag.to_code(fizzbuzz))" + "print(tf.autograph.to_code(fizzbuzz.python_function))" ] }, { @@ -393,12 +252,7 @@ "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "7moIlf8VABkl" }, @@ -414,44 +268,47 @@ "id": "QlEvfIQPAYF5" }, "source": [ - "#### Game of Life for AutoGraph" + "#### Game of Life for AutoGraph\n", + "\n", + "Note: the code may take a while to run." ] }, { "cell_type": "code", "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, + "colab": {}, "colab_type": "code", "id": "5pCK2qQSAAK4" }, "outputs": [], "source": [ "#@test {\"skip\": true} \n", - "NUM_STEPS = 100" + "NUM_STEPS = 75" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "GPZANPdhMagD" + }, + "source": [ + "Note: This code uses a non-vectorized algorithm, which is quite slow. For 75 steps, it will take a few minutes to run. " ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": { "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - }, - "height": 308 + "height": 309 }, "colab_type": "code", "executionInfo": { - "elapsed": 14892, + "elapsed": 147654, "status": "ok", - "timestamp": 1532101593030, + "timestamp": 1563825336196, "user": { "displayName": "", "photoUrl": "", @@ -460,15 +317,15 @@ "user_tz": 240 }, "id": "hC3qMqryPDHS", - "outputId": "8405c0e9-e518-41d6-f5bc-e78df6474169" + "outputId": "56a095a3-28a3-455d-e95e-2c4c9dcd97d2" }, "outputs": [ { "data": { "text/html": [ - "\u003cvideo width=\"432.0\" height=\"288.0\" controls autoplay loop\u003e\n", - " \u003csource type=\"video/mp4\" src=\"data:video/mp4;base64,AAAAHGZ0eXBNNFYgAAACAGlzb21pc28yYXZjMQAAAAhmcmVlAACZUm1kYXQAAAKuBgX//6rcRem9\n", - "5tlIt5Ys2CDZI+7veDI2NCAtIGNvcmUgMTQ4IHIyNzk1IGFhYTlhYTggLSBILjI2NC9NUEVHLTQg\n", + "\u003cvideo width=\"432\" height=\"288\" controls autoplay loop\u003e\n", + " \u003csource type=\"video/mp4\" src=\"data:video/mp4;base64,AAAAHGZ0eXBNNFYgAAACAGlzb21pc28yYXZjMQAAAAhmcmVlAABdAG1kYXQAAAKuBgX//6rcRem9\n", + "5tlIt5Ys2CDZI+7veDI2NCAtIGNvcmUgMTUyIHIyODU0IGU5YTU5MDMgLSBILjI2NC9NUEVHLTQg\n", "QVZDIGNvZGVjIC0gQ29weWxlZnQgMjAwMy0yMDE3IC0gaHR0cDovL3d3dy52aWRlb2xhbi5vcmcv\n", "eDI2NC5odG1sIC0gb3B0aW9uczogY2FiYWM9MSByZWY9MyBkZWJsb2NrPTE6MDowIGFuYWx5c2U9\n", "MHgzOjB4MTEzIG1lPWhleCBzdWJtZT03IHBzeT0xIHBzeV9yZD0xLjAwOjAuMDAgbWl4ZWRfcmVm\n", @@ -479,725 +336,449 @@ "bWlkPTIgYl9hZGFwdD0xIGJfYmlhcz0wIGRpcmVjdD0xIHdlaWdodGI9MSBvcGVuX2dvcD0wIHdl\n", "aWdodHA9MiBrZXlpbnQ9MjUwIGtleWludF9taW49MTAgc2NlbmVjdXQ9NDAgaW50cmFfcmVmcmVz\n", "aD0wIHJjX2xvb2thaGVhZD00MCByYz1jcmYgbWJ0cmVlPTEgY3JmPTIzLjAgcWNvbXA9MC42MCBx\n", - "cG1pbj0wIHFwbWF4PTY5IHFwc3RlcD00IGlwX3JhdGlvPTEuNDAgYXE9MToxLjAwAIAAAAPQZYiE\n", - "ABH//veIHzLLafk613IR560urR9Q7kZxXqS9/iAAAAMAFpyZZ6/h5MpYA5/oqv4s2qPbYpW3jfK6\n", - "zQ6q7WMrNj7Hy8jZzmBpfHCwAAO1W4riBNsrapcCk+5V1W0XkkFULR4Qe+H3uGA2HgNW0zFAAUgt\n", - 
"W4tdpXv2OEg0Vuy5W5l/xGRmEGKDyeXyrM0S6q/1EKbad0x2mcHseUqNmeOGLy1N3b376XZKZcPY\n", - "IXC5F2332tNMj8CwOQiXM9PiCLyCVfZ3rQSkKBTZErkpS5kXUyoJG3FdIqLjRFKEapbUjcW64HIo\n", - "BeIbtRyWV9FyZfcTakx2KW3eB4ZI//MDykSe8CRgN76uBEqZFXwO63wmUREhHOb5AdaLV3xyGl/I\n", - "RV70rU/3t9t1aq5mFD3hy1aLTAV2U7nG072dyX87F7NgCxZHT2kFxu44fxf6gqVzE3PEbGr5fx9x\n", - "7TKXtmY53VP8UaeCd2HJiZ/sd165SutTnfiWvaLuCnmmXGF0AGqbj9S19kgOhTubZIJBydTTqQOV\n", - "YRlxbgKn2nzvunv9+NDG0/2ikyyp73W15QClmjyt8dUeynoN8CwtEQ59DdrAPZe4ARZTwWAfsRXw\n", - "1vcZ6Gr1nCNWllQw5IyZyxQtXrfc5p4wjPvGaltciG7d3FG1SGk6HDsZy5i/PsnkjRXLUvGbzYp2\n", - "2gs7ZSGfSJbEifctcMGeSqhOOYORKy6f/9omoieCVEEkniBXwWZ/eImb3nxF7SFIaBjgG2j9w5ut\n", - "BY6zSuQ5zRCdajzJ1loNO0havI8mp5yViAeAlLKYCxeK0Lha1FskL67W1YsARZVZ5EkhqAYEeTNI\n", - "M38Og48OXmj6QBN7c1b9uDUTacYEXO88ZQ1gCIREIMnm2Fgkir8pN4gtSeQ12sfOVz5x5KX7sa95\n", - "L4LyFQPDrFZcDBr4PWLeEEv8yzk0cYHE97GmAlA6WQ0HlWsS42cnXefvTPXnx4vcq8pbEo/slAuH\n", - "IBsrJEN1+aMCc9FNxwUPVbZVaWVjwLY0qh+mNWEaiNGRmacDXrYWw0NjqMPiLiFHacY5oGELRgym\n", - "S2mSo6zhsD1wKQ3EUQtwrjKPiDYc/HCqhkVwoWKUdI8xTS60kn4f5UqB0L77Yevh/wt7AnvQKQAq\n", - "QAEEevggRl1uigbOBTtscnYRnAj0edW4QExAzdo+RwLWXTzW/l3cBWTrh3ORzZQlxJ8jQTvPLB+f\n", - "bLazJZWFQQDcWhuhQ3gYcP1ruNwIroINRIr8px0UOgAhnk6CllxMN6gA5S0YPhFVFKd3n0AAAC9f\n", - "vYgISQAAAltBmiRsQR/+tSqC8p1IAOZemTPutEfx0mzK8zG8tdIxonBsDpoLZ+NnIOp4qK6idP1s\n", - "vbGvZz/zHM86Bg3q0yx2atmtgoo/Trt3YRy3se4HTjou+tCi7oJt2d7A8vEhVDu33JNJx+WCOgP0\n", - "03nVdg9lBs15v/0w7qMc3zqqJXCOy/Whl9aRhcaeOEWcD7uK6mCV8a6MpDJ959xBRfv2i/qFOFbL\n", - "Grs58WiGJcq4MQJI+rVWuFN50oiqBgiunfUrRmdviPYpNN11V9pwcOJwssWfIE3agnor/RC7vfLY\n", - "YoXzaJjtWLEL92OOaHLZT0j555xfb4FZcoJee+RXovB9IaoDdYRusngtBXPMUvnO+g2Z5Qdo9P8q\n", - "Zb8ItBAeHT8IBZAD/Z2nEA6qbxqOBSBtQNW6ZFYLtCTIoP/bLjCDHgtZk3cf+N1CpXs15pUIYWDW\n", - "elZtlTkM4w4EJlLdjLZyQPAeaBx/qoLmKyTKAEhm0hU8EcTq00f6fwkWgz2J6GTGtL/vJXgC8u4o\n", - "nTnf+Ou7sVJGVaouXxrzx+yGVHEcp/eV4gaFA95rInngQAOZWbA3558nK61JBPZl3NjEv5B9r9pg\n", - "2+SYY3wBAUeu2fgAB2+yYGw82pkoJJKpzYWORs6i1vn3GEgUTcwlYsdJcraYC5SnGvqSZhX7KM72\n", - "uE1e9bkpvpVyG/mkACn5R4jwX3xc2utCjjZgM101rirIF/7VfDtmJsSTDes+UVhbSr3SeMSI9ixJ\n", - "+fVuFZ5bnQPoRIfPc+Erw+K99JiGN+HE98/eq4pPlMY9oCfVPSdNyOAAAAFfQZ5CeId/AUuqOi5D\n", - "jlKfxuJGZZ1+rVyomjOIykvxtsjsuCiGtElbraCSFWcn3aIYWLrF3fPovVLcOnroBkiRMsdf5yJA\n", - "F87MQuoKeTaGOrxojCCCS64RiHrqNsE+7mfRRUDuB4sAEHFQHxBorgTukPSvrdFr5QDq+BhZj/6H\n", - "KN+IutwFWKX3ZX9pO3sI8My78TgRY5AA6FEcT91WcvnMypB/OWXzK6M8fYuhVVWipAZigjVOYhcF\n", - "9i6GweQFX9AV9EUQOp2qFbkrT5jceBRFLX6j4JUQ781/UGTekv1fcpCmzlpNpp8GdSeWxRL4gasp\n", - "F5uO5KW63rlhYccBo1cFwIN8txHNnwyQNiP00XC0PWDRZfaWSxsACRWrISow71IyUfcL7JNhjTII\n", - "rwDYATS0xZ9ep8siFC3JTxg1eNaroYfeI4tbkRHok47Vk+CUOQPuagVBtFMOOcy2OUbw8AWlAAAA\n", - "ugGeYXRDfwHM79ghzBo9nMnzfQPPIuvorxBb6AC8F4fYGD/t93kNSKNSEuhUXq9FKGtxnCkxN880\n", - "BPb/uTbjLTQVyPNuYlGl/gTlyLcVA/cDoLrl5TvaR/AcSLFE7C/t3kLx0STNibmdAf4TsHWKSblH\n", - "VWB4X7oQHrrDdhwIivRgUZf7f63j2XaGB+cbp5aHCCwJoovY51YTqsZZTz70FlSnypPHQBNzif7h\n", - "uvZkXhtEzpu9rYMo3YECkgAAAXIBnmNqQ38BDchAitLfY16mYQAQlVmv7062W8KLpIS1/zhS50Ib\n", - "b3ERigmkZKZMPaCsAi+zsLcku/gHGHnVZpuCZMFs72gmyuL4JFo6VjWcr5FtBvzIgD26rBNvP73P\n", - "nJjl3JImmFHiKjNez/gG3zTuYyCACuJCEYXyuEmzCM13hdCPHKg5GZtso0Z1qk6T1k2oiqF/3RIn\n", - "kyjRWuxBlHHmJ46TXULiUY14G+RAGoXI+u/G6muNclld2bq+6Zztuy+5ynaDWNNjuN1Ag9KUIx2F\n", - "XwNdepmp52/rOvISNPbMJ0U26OvqplXi+qHTbg8MLpUSIGCY8w9FZ5woLAENgvgu9M79yGlL20e7\n", - "ypJ4RMBqHYDpEz6Z+SSjXD8LsJ7VKlwo22A5Yukp1vTp6HHA35nV+PXK09DuRWKKdQUzmXVihF51\n", - "/+bB0PEFdoNxGdbbM7WveaCJN8XI7JgQWvw2nPlHX8M5QyPGSJ2HEexumoFrABvRAAAB70GaaEmo\n", - 
"QWiZTAgj//61KoCPNGHq/MxnjqmxxQAEHvTwibmyMZGX3ES9Abh1tMR+/DjR+6dnqRr/VxCl6gEP\n", - "wJ/5EYCYfGaGmQYsLOeM3v2SZjdvqQBwrwKk5A/63kFm8fc3QCLe93Mldv3KWXHdFT7/mudSntDc\n", - "vJwStG4jgi5LKlWdSrVaAxOmElsF+zWNzaCIQ1dOiZqi3JKj64hOeq1XIWyGvRvh6OLKBpB4rL6W\n", - "ugf7H/IPbSQuF5jWV7zL5LhxWiTiI+kAZTUMfO2YOLzmhCUSN9GAmNzgY4D2awYB4V4QTDjI7kdQ\n", - "tL+3Pmfl1HVilu7nC9CzQSvWIosiwv4btyHTL7IPT2gusybyNfW8QO133L6KbDhhXSDWUtcIFCgn\n", - "QUm36C9hvgGjorpKYr5VnErpJX6fRJm76fFYs8/nt763alyqdcSrqaTOLaf/72Wkkmlwbq3nLOIw\n", - "ADFDkkAPwzaM811K11iK/3HaYRT3nEhjJQFk5v4WBXwIVLAZeKdtC8YoGN9K6isN142fOG3s6fm4\n", - "J1nMtOEZHIwep8In4slLmHh39qBzhGZO3igiVpgz7u+JMBeFkVHe72vduBjIy+1dqvxL/TPics3s\n", - "+alwfTMNQKave1qW+5Uj8jZQTjcLAtKvzoako9VMIOfQUQAAAQpBnoZFESw7/wC9ZU4P+UeGsidW\n", - "4n5tFkXmtxppYvKQ+WGj/x3AAdl6+9c9x7N2b/yJykTvVggfpMnFUWtxla4sr1ouwANom+Uf4IBJ\n", - "/zXPovndpGdy98nJbZxFU4rrWpr8aI4YmRX65+IGTn756CZWwXKY5DyMgKnDcCtk0HEuoHgdGhh7\n", - "1PG8+nue+pE9pBHqiBNWAjPd90qfMtABmMShLoXtUObqYbqXhJvVjjFhKdPS03IF24fu9Z0ax15V\n", - "DnkiLmgyOCvJmcdIX70L2ZEECd/hxrSq9JUVjC41OX0F/ayI6GtkPMUuZ2xWkMFo5rqOAo7v0Zlk\n", - "ke/79TjeY13FNiowqcbhMwfDuwAAATIBnqV0Q38BDXNpg2t4nJdhAA5ru/5Co2KbB/AnQt7fa959\n", - "0crOQgtTxL36jtVyKPmfuQMYuWbJ/7bYTEV8sEjceHvN6B0CSEZzVCjaPLzOQJZMQpQ4K4WKPlGc\n", - "lnEwYAC9Dsejj7Fbk2RyCFiJinyU2HOscjUR6fW2jRsAFpVq/PtZDVPvesPG3AqooVaKHp9Ex+Da\n", - "AH0OvccSugyDKsRBAEiYR8645aXxbFSzraQsELDsIIr6HRN8F3lUNVBvzNO3mxBhq4th/kgZSjjJ\n", - "JZrYmg3UfIUO/jn4xs2XQ9Pa7Uy5K3JhuIQwAOUKDmAMC0p6fgz2on4ceyEcfiCGDPZpPyL3391F\n", - "dXID0ctPQ1a+Hk7UcAc9gSDL8CZKz59YyO0ACPjfAKV3Y2dbTAKdWBsUU0EAAAFEAZ6nakN/AItk\n", - "aaqbMCcBE0iEIDnEBfRZN0neHQxaz5DPSzK0ZSL640q0AA5jkP0YAYAumNCN0MxJYpWFoQ9r43H0\n", - "i9SZLdv1UbgpG3aX6KESZW7AgdlevaBngH/w8xYsqWx5t90zzi7x9VyRYpIAD+XTrxvgBoFILNCs\n", - "gd+zDA9uvbAPlLMwG/qFltlwvLokMt344erv3a/C/ySOwZHFzpakInpJ7MQHkmKi1KHZB5KrfqwF\n", - "FnglZJwWbe7LtVojTdwQnAksziDNlEWCkMQQJwziY1KYtlXMNX8mZ3MtYR1KNf/CNin7/ys9ZQyx\n", - "4Zlk//H5KDc/8O2+JaxH20CAaAABxgSxo+yJal1LnRHYfOQ1TygNueW/rPAA37g/6fLS7mbYKz7k\n", - "dsiSiy1mAV7n/qq81UHJPShQSXK+E4Y5XKuXEWG4AAAB8UGarEmoQWyZTAgj//61KoAW7kO9JCjl\n", - "XSE6nAngAJVxWWFl/YDS0gZ32xjwUFed4hmI6rj18z16nS3Mz1iMmFblrtaE4zGXS046COODiIwH\n", - "QG5lRmcBExMKlnynQruQtA8n/NitzdP/ysLrucGyp5nKV+XyJURULfxk4kwNp0a5TFlJ1fusOOJm\n", - "y0hvsvEg+d4Jz3anvWT6M9n5A84CGucNifV+WlN9gI9gs3qSoCZdU/gglcFYM5u8YchzhQFyMKxn\n", - "kpfWK2LU7aaZHt6xLbqjuv74523K9/dtrrsFq/LySiv1P9Wk6/6d5RC72z4cyaUq6hMMn4IWWRo0\n", - "zJIM1/lSYsWxt5/M1Mkv00Rt8OZvmLxuFfd1BIVlANlpgZ39RYhqqzU6v1HwaW0EudelFBGhr5mf\n", - "GaDE05Z8ywp5rN4Qq4D4GNAGD/qgEjtaDDf4ZBAD/TAHBwxfNjm2nPAdbbbIuWSkkv8NK6EMlKqH\n", - "mOktd+CB3P6Szd1+HPnUsyQ3659r3XLnoi0cvM4usfW+BgxqT0mgHSgn/F6ajdTNM+a8xJQnT036\n", - "7195r0uF5vwi7PIviCQ2E4Vs4Wx80/8tBDEJS4qOY1YJ5aNV1OV82fB3HOimLHd2vU/d4Cv7OBh8\n", - "k3gNFcjeBGh+3lQcDCLZrG1mAAAA3kGeykUVLDv/AGVBMHxAlJYGEpFnv2bb0ADrwvVKxe7+SIJI\n", - "g0dPJdL0s9Hd2mGX7rpdIiUH9ZgtnBO+m3uPNae/YtN3u2p0kkCez2KiPNqgSoEcHM+ePgq7afkq\n", - "0HHTSZl/+QbjsyfbI/0lv1mLAJUd3u7VZPPHSdXK3vwLfAwOe3Nid72slU892DijWVvanzM1IzDQ\n", - "XfN6x6GH2qfaLrHePrJTJxXC/RSxcAol7x2JJ5OA8VjN8jXu0yKirBiYqgcdFf9odG8j4bRmE2wD\n", - "MG0SKuGrJfd91b6B7hbRUwAAAPYBnul0Q38Ahz7YAbwPIqnkAA5sEIcKo2/sVUP0LEeFOLjKjaet\n", - "5YFAjDbL5BIdGqWouG/H8ozoec2ZpUbIZu0ELtG5yXc/5opSZlnqbOpqdTQkLs6gr9dv5GbFvVjS\n", - "Os1j9FIMQsdc8pttosNtygWB8gLxr65El6umAZE5CVU9Mc8Xxg/tenmTduGK9Cd7qRDiu1sLYR2f\n", - "or3KBMo8ebz5q5EmWucvREbYSziQIIycIwJg9OG+aH+ZUEQbjbfHfaiX7yoxGJGP78aNOHP7GvC+\n", - "JwM6DxnSyowUBAqkW8ckgrhet8gYYrt8MIe1MPJQB6sv8hHuAXkAAAFWAZ7rakN/AI9XvmYGr0rf\n", - 
"QEvrPPTQWEAA5ru3wBCXPJiC8OaE25OBvVl2wRXqp61wQU4HxGJCAxkSOz+G3Yzvg36uCK8bPZTq\n", - "avaOG/H9WxjsuwAl/bIYJdnyD151CiUZ34aErVIixKJ53oKrLeHr3xLgxuH+y3w5uH5lQRsL0Pmp\n", - "0jQItTBkKwlPywxFk55pROuYZWi/h/N19QaFlF7WPobUElLlr+nCH+pVt1nW9/YwVGz/cO8zwmWe\n", - "Fb0OnFji7CYSsi9ScC3a50GjUP7IpaY5NAHv33V57bkO/BD6dnreymTbSmQdcj7PAJkvz610fMqn\n", - "mDGTMB31oxAIE5eWeH7mBZouSgmtxEamul7sYaTPe7mP6FqNCz0h6wLot/zAFwx9/D2+XB0x8mmS\n", - "b086o+gqkoYoHQeQm2Sb3MU1Bz0KHDGo9jCmsBmecxs3oNHV4KaIoLKAAAABrEGa8EmoQWyZTAgj\n", - "//61KoAcdmk2P6doyaR4wEHxsIcmssCD5f+3/v8PGtlbWZ+A0oGGFPTAdgmU2TFbrRxlmwUCouNe\n", - "8freV7blHDodFImzwP3saA3AZT6NUl7vDGH/tw5n9y8rP4XGnhEXBHK+6jIhoAYc6G1CDX0mqczJ\n", - "7tbei5I0YSkDjza4rJSbAF6cRoJQH3s2Q+ggBQR0BfH6N3QlPVwd9YFvP6++J+XrbNU56Pxu6Wey\n", - "51asar4AaARXHregTXL4xn/VNt8Ppk2xD3/1jXAVXdqMlS0tYGM/TtrcuTC63Lx21RQtklG6k0xA\n", - "eWm6W0oL0KTvxuyegpC2ySp5v6zpSEYvzWR4IYirfT0RYU+jLtX0t4M/L/0k8xOLTHbouoUPD6DN\n", - "dYYLYlVX5noJzjCAVCiS21OCcIKqWD/YiU/+dTZpdFFNdHEa/MPvUEq7cJD7ANJ0YUweepq2Eqdh\n", - "57SC4Tpg6jyEnFgMaHQLSz1nJNh4lxM1TPouGZ9bmQdDr9WY+nwzRBa+ZLnaqBSYKWSKEs/TNtNZ\n", - "ev7d+EnJUf9G9CAmmiSDlRAvAAAAz0GfDkUVLDv/AGU2nAwHHyQlvUxuENDSO8vXFIAPilnMlQWb\n", - "nTHwb8wkIo6JKOaIP9blrrNXcWeeQDVprB1Bn//+nbSDHls1apJcUyMHUmojA58P91gutTiF40zp\n", - "fDaF096G01gcvpH5Za4+DfUvxQpt/wH5PntJzggww1tLhP1NyH5U2TTgrnA/BevK2aCa9xCuCVgA\n", - "JJZF4uqHE//COeWbJ6LIFJPoadxAxbrAcxPQQHMzEG5G5S3Yfd+YJBLrdO35JvVrsUTYO4AfvJeC\n", - "zwAAAe8Bny10Q38Aj03WPPyvISnWAC7KM5WfLH925SBeAKcvJaYOa5WZCzX9H5nU/7qAFTCgAnl3\n", - "rAoSnKk1337XDAnLfPYAAOSIcqQwF++e4HouwNVAWCEsVyl7Y6DnBaBT2mD1H8560KoMvm3kKNNC\n", - "oxFCc4BdAIXk45JUbGFNGYAjCbBbJInMjwa41HA404yKnJG7rNXdBctnsSL/36UoXvVx3J2tGX84\n", - "+FHk7e72CsAyB49ajd62idmFQji9Jj1GaiqtCIjWs5o6Mz8s5QfrvipNYYD0YZ7gBBGm4AEz17d8\n", - "isscgsp4QI2odbuEJDq1nfJbW6+1HGcN1XfDC1Xfa5IptM5UYHm5zIT4rSPBIDE6l8/NhVxlFP21\n", - "JPQ0DZxnZFvxIBznQbqkhaGZjMafgFoRzC9Nl17x+K6e75RlplRZtXaUIbjAUFBJIQPkoIrT6/O9\n", - "NtkAmnl8qqUC1RktW/RjiJqOyRTTITHqNKvKy/0gb88xEvvGPgzcSs2KpkbHJWmCGIlSWEkuqcCE\n", - "jBn3Y8XOQxMUxEYeLPJ/9s/F2fT5NAnko+RFlv75fWLekZZP2s17yJ5ccFGhZyrkGX6u7xXK7N8G\n", - "Qlz8qfOHvgMQrlB8p4j7qtnPgBPf8mcsM295CuAZxkK+sut074W+0hM24VMAAADaAZ8vakN/AI9G\n", - "UrhSy/Rrhc/LGXguupji5cAHC2DVoxU1gWUkKeMT366GcmuxH5O8lBZJeHl8r2KNT0EaVARyW7pN\n", - "L4uNsKKl/WAzLJ1OZWTQf4NaAfodQGO9KzZS0j6oGvr/urKiQwbP44Tv//glYQyyCFeq+8nnrHBj\n", - "aACu2w1otySh0DYMX412uY6EYcx3GtQaRpNPiKQniWdVV2KH48fVxDy0uLS0SmCZEAWLVNvtWqO+\n", - "q2OwCBr1m50s0i8eRTlSP9xoKtxWC4ZqL77eAW3kYEBJOAywYUAAAAH6QZs0SahBbJlMCCP//rUq\n", - "gBY3NzYDjVIwwAKbp/vtZn3NtK6t0V/4sA0MV4ijJVoTZ+e36T0E9eQ0LOyzsqR0ULZJUDRy41oM\n", - "RdsBwM4wyEJC67daWmuDEXKhZo862uqAH8A0QJ5u5RKBPFpngChYYJdWzP3onEWImG8Yryy/SXt0\n", - "jQ5te76AagLius72bzwZ4AZfLm/04ID6oXhPwqkf1cNsu4/kIt7oCOETiL+lzwHLEnEsdPSz3DxD\n", - "uLGkH8o6jHofDxEXcB6cOS43aUxGKPYPtHCj2gw6RzcRoX5lD5mwqtoCTxk6N8TxyipSUyNnbA2b\n", - "G5NuBUVLHTce3QKY3SdkbyH/wzdOpT3YHUE+FYQwMKCF6SMyMBxp2gI9k4yUZYljUiekF2XIFkfv\n", - "TFy1RUmikOycLKkTYTreTarsMD5JfjZ2FJWrroj/YX+uNeGtKNZl9Zyt+k8u4Htq1bPYEjCrLHds\n", - "qeIuFWmvxTYEQblStjDXmWfITtxy8KvOgn9iV+KlidrnVhlE7Dz30fuHXxxFZvIzhgU9uv6sSC7T\n", - "vZuGMsKGBGTYmSe0P9hLI2VyM/8GUWwG/AITiU4a7OVDjUNRPaiIEt8jt2oImPIY8qcrJ82CVd+P\n", - "mSjoppoeHUTHmeo+koGqjhwT7ueVHNT5VZ4yuGKEDdFfEIkAAAEMQZ9SRRUsO/8AYrbCELHs5dcg\n", - "AyOPuRHZUWtdXLx9XaNQixO/8Cc4Q2MgEa/wKETsHiR8C1XOv7rI3JB0rg46JfjEArbHaTHmANKo\n", - "+czcI/sIduYNFOE3TvObMh/KtGpZSdF+qnDDtY8zD+7RQUdzmkG5zeDj3u4Vq+f3qnKCwgbU+U0R\n", - "dQR9Q60wXqL03p/iYVxkI8jJqvkECuxT7efJI+5rmzyP1yn+WKY2EsjjB7bwwVfe6RxBmzR9Ed/9\n", - 
"CA95ILUJxNg4HsmCO2Ko+MqZAH3wMlG18kUm2ogL3cKIkVXogjofyKhbsSpKLpFFk71DzB6NrY/3\n", - "HfknWM2yn9yeQB/joufGEf/bvMAS8QAAAN4Bn3F0Q38Ado97WJWiqN4XS53kTA5YWsnJBdebpf+9\n", - "lcN5zPySAC6fH/XzBsBKbxdm4pTiPFVrmGXyhaRiB6dxtlwj8MyI40Do8AXHq41BAunk4K4PTgzR\n", - "rFycWqaL549wB2C5jNCLXlq6Tuytik3ijlMSkx9noeIG2Lc83eWkRkQieksQSO4xI1tzzkdqaNhG\n", - "ExZARu3MauZwrBopslb/ZLdR5ZS0G6p8o9DD5cphJjxJoSV/70/0Gr+woS8Zj0JpVvvpygE5bXQp\n", - "/YBCqjmq4uOCyt9SvCzPelUEwXEAAAGyAZ9zakN/AHZ6+HiwE6fxvgA5rqP9zmI+FShvhJS43N4N\n", - "sc5a7qq0DK7DHadXkQxf+APmeqLrIGM9X5aCQgeyxdoAlcQoyNsm6ol85w5z6JV8A3YntmCae+s8\n", - "+8/Yheg1ctJWrSharoeypUyemQeq9Rm5cIkSOS9Ej0hbIHyFhPQW6K3SawgMNVKQ0s1BpJvXDQSY\n", - "x3jIEdIgEtwe7zce/DjcO3RNN3g+SlPoM7cl0qJbM44NIDG9JGXcwVrY/YKNrpChX0yegP2ZHDI1\n", - "MzOs5eWP/2l5loJrLid2mK4Qhw6EGFrIadsV8rSjzgHRNuzJ4U3JdubidEobU0ehkU0P6MYRK/XM\n", - "58mVywGbsw6LPu56h1S4w3zHGYMd1zPKOsnCUhaRfrSZTxvjerNQ22prVPqBstk4JgHdnSScrwGw\n", - "eQcqvIw7gKhonPDKM4fJtO4n2EsI5Cd0iGMjmgPw/PU3FL8ZP3QbYLMwZ81Wd7BLLBDf+ngKiFIe\n", - "it4neyhhaE/a71b8TxeM/ZrgH9+D76dlgPI1ZJW6CCVyIs6Y5gK2plkcgRYa0MwWF+1A6zPtBEgA\n", - "LOAAAAIIQZt4SahBbJlMCCP//rUqgBY9we30eRuAA2kMf/9/gX2SHKs8Uq31+W7Vx4LugxILnhMT\n", - "6icG5WQzdpL8yjIXjBq99nVaYweUdJE3LrdOpsVxNJ3kODVBkposYOoRuOMi/SNhcjrJwShp6ljG\n", - "Qs7tSeRJSYDkvm+SI2ckjbManbEesw6wo2ZffuryaLuWkU9SNALC+2QbPJD4bFy7sTmB9+6VOdMm\n", - "rnLvYN4ZyAJz7OhQG85P+JnxdgXgvSv66sWBs05p3vOE+53H+HQCMTLVgvoYmHNTIYtZ5CIln4hA\n", - "GrjLg53unVVQTiYlSzZrRE2vmtsqac+v6CrcbtgC4HktflvPTsvgqWNHri9NWa+EuXgx/AgGkZVJ\n", - "r1n6gAd3jtjLtv6YvbPiBBo2AhBUxCbYyroAjcvjwUBtRjXTdDEvdYfItmTKA7W3+KvVi/PCtod6\n", - "/3gOoaA7zRdO+8+MHlGl/c2xzQhj2O1n8eJkOu+NcsBkpmxyosDi11EOEaiQ6vfnOvH9MSM+7D/v\n", - "k91SLlwv/nF+5eDPHSLZQIoFUjHjwVoSGCdOLqmIe6tsfTERCeAhC+1bhRhe0612KIL6izjolsR2\n", - "nUgrl1o39HqnKAVqQ/HguEezLTgmGW27Df2kp4E1wRl/EQgEcsMfBPga1ndY4uHPYq84ArNCWk+c\n", - "YwxlHAPVC3PK3Zp2kQAAAWFBn5ZFFSw7/wBXFVHDEfqz5TAg6AmqzzGCl9B1ICKhB+tKz4Y9Km1L\n", - "/vZyZ1OR5rO815FlrTgGoncUDKVNjpKrVerCm+HleHb1b4FhYQG8B61zGq10uLuoQHIyL4Cv2/mm\n", - "s5Mi7ZftErBt64oWYphUyh0Hmn9dYYheGFzLdE9gvqcAEGJDyLZq+nfiK0Px8pHIgaIfsEdSUYcC\n", - "8Otyxta0EKY+Dm2m8AtQ8jjuDmkSHm/uLhgf1uCnztOKFhkR+ydRCeR9tnIlTfiv3gJbsPT8swjP\n", - "0OUm6yT8LhwwCJU0AGI9hN0/kTkz+NeSHjSPaBx26MAfS2Y5NEtva844h4B/RttjqxMsNDiDrfB4\n", - "5xn/Cl/3XrcF40eivyUSC+FHzx3M4BoLQLOKf7iz8hKiUrqRGVkGToUMxkr5192x9xCjbuvLRMd8\n", - "9Pel4WIOhSi52xuSf1eEhC5VVAp4lHpZmHCbgAAAAaABn7V0Q38AdnTaV3jxqK844c19uepGJJSA\n", - "C7DQuTz6pWfCzxcMbX5JwHItpyM9y3YT46z61a7h5Lyukp+nSKoO0zQhT0EB/u6ILUCNvVbb/89X\n", - "7TVI5UN6EFwYYfi4uoFmqb+5Cd0J/+d2405yTsK/f6WH/T+vNB1DYWrW67ctgHOgMHAWDLG9mitl\n", - "16bXmPVSi2sWzpWYg3147nlnaD00aZHqQlrMPzYTLLFwWHOLNqCoWpNLMMEevc8AnQWeykk9VNTU\n", - "NXzAXhrKDXl1tLQTxZG7GX3K9cQyeUnjfH3rMBGDD2zCLGXrMfPVl9EJ/F5M49Rjn38sXUf2JvF8\n", - "D9r9tV1APCHN27+egfFIMDg9OhrQMtjAe3WEfpYS7pl5yHh7ZZ2CedEo/Wf/ygYTAQFI72AaUTrV\n", - "n47d9OSqAdYs7lkgV0864auRyPQeTKK1Sp3ADeIFS134VGBNG1VnrfyZuznYkI2r0FVkGFrAXpUu\n", - "ZJmyKqqILhJ1OTBM8C0VBV2QXBYa2aSn2jj9t40/wJJWc9IGAVR0vj/u+wFocjwf4QAAAZYBn7dq\n", - "Q38AeUc/pR5QUuADgu7/kKjYlIf8yn+MfKKvFMJ4eRJz/DRqteBIBJsZW3T3phi3NzuSw0zOvEhr\n", - "CHz7xEUteyaR+fa6YCBeiCtangbUerW/UGoCobzV/74XB/lXH53NcEw+6x9o3/ZgwG/7l4psK3P0\n", - "EqSwtCrcKAAv8Wi0Z88mFp3Sp19shMF41mqYa8pNsyefrruQONS60LHg/1GySbrTeTWW74lCDwnt\n", - "BGXpwghp/QF087PP7hxkE8lvu8APh5F1FTiOCBSvJFm6yFC/tz24gmveLoV4Rq/qtYWRE09VDCDH\n", - "yjftToPMsyi4DoCtXsPRk5Jxr9Mn6xDxGjfz8uMmOKJ15ejPi/Sx9cR1QrBsU9dhcYifdB+c0AMF\n", - "PolB3N4pBZAASP6m7EzaTer6yZ2sIKcQdlGt9xsZ0SHtS2313gpdJkLEVrHpO5/BTcfUTTcK1+bC\n", - 
"PwRYX+iIyInP1m6htprdy84ySZ5IaGCpRKFxMCf5w22wXyyon+dlMPKACguyEPTCCZQ2MqEuC+sa\n", - "uB/hAAABxUGbvEmoQWyZTAgj//61KoAXgR9s4tVmwJ9HTza3s57iAAoQf/wjqzjlXnP+29f12EfR\n", - "S7B+4I2epG2qM/uoQ7VlrfXFlhjyX/aTq0n55QXAKa2xUKolKsuMfmZFFc6+GP96b13JiSidvPgt\n", - "2SSGnq9Yw4MfceFmgOaZRcwoMnpdb0UpI73YdP+DfypKyrkDqKWcBc/BGhrH8+XdnpCNDXfg5rMl\n", - "b0uFlQ11yUxnDYOfRwLbdjJA6FYddawSEVorFtY7jkSQx+OUBUgWkKC9rhKB+uV/yqQsvbuFiyYV\n", - "MviBpsZgSSN0TOC5JedQ5H38ENVBLjXnWZD9PQyueLoT4qwtI+7lodFSnBG3zboWdj6P7XDbgKT/\n", - "zKkFObUjwhstiQtohzxd5AXhBH3DQqNv6mRzuMxFDcTEo5ut/0/1HrPGOF4R3sJ/eQT+YnYseqvc\n", - "0m5njpgI3qkLmn8efBB4q3zWGpHCxBwC84HKjuugMICuXfcJHKn0aWkn65aEjT8AdxDWE09InGyo\n", - "EM1wsU0JgJ/qq/6MdHWfQW6+bt5xWlpYJ4axi9wZc3Aoz+Rixn8UVM2e/bd31+W37ucz9udquxnL\n", - "2JdNUAAAARlBn9pFFSw7/wBZVXkLa/7xg9HEtDOpc+GkSv0gCD3x6eQNkROUaCyL6QH8m/0USPLW\n", - "nllgC+uXg2X8kUpaUiErsLvwKd9y+trtKwV7xlvkAn0JqEnToCvptE1Sb8eF86DTi2ywy7WE/imn\n", - "jNBYQny1cV38ScnZp/V3phWQAYBG3kUdNNuj/FyVB7DgbQbTLK48AO5nLYv8B3LvBNBfBJ+ym1yg\n", - "YJXKwjm8kt8xUjO2UGKeggZOs7YHWr5Fj8OX4jV/B3/cMzP+f6YyrayA/80F6f9vgrbTlhWdlFQ8\n", - "QtrHKjmrl874OSSPJYH5wfQfF/1NrQd6soxjmSWYI9/FqOPoy6ujUPxQvg1fUda+wK31Cv8gD96H\n", - "LPqpgQAAAXkBn/l0Q38AeBaU9hYCjxV6lA176iBcJKIHTfhwkqkAB+a0LmdvcgdK3vyEsSkCI+8U\n", - "up3OQ4OQId/B45+Mf5P4Fc2VsfnQAACxyzNkvgEEYwZk+TyOR6/VZmeFNYMrBdqc2NNBlh56ISK/\n", - "h5V9lagvsX7yv0p9Hk6RXo3uoMgKhKOv/QgBAqhUvAKDw4DS7G31tehd/myRMmCPxIJ79bZsQe2/\n", - "iq7Nquzc/VDpPXFZHPvOmiyfyrt6Fxc2jLHZJGpvacPTIeLJiSaBxgRTEKBr/xXaKQjc5nLhlwgc\n", - "HSz1WRlyOsXOkob3rY8KoGVETaaIvHEl7sVHsV3QN7iR2rIGzf6YHv+c3l8OW1b7tAMShtcCLifl\n", - "8k1OtS8Z5o7MNTObuLXIONSPGo1fC97qRzqHFEfMZntEMqsFjjWPM6JduvRiAv8p/h0kRdcTeRox\n", - "t4PEdFJikYgCJgtFa00LDpNvd6Vv6MImiivCAgL9L7zEaNCr8p/p5ZiDugAAAO8Bn/tqQ38AfAnX\n", - "r+Rl0wYAC9kEZglKr0YEZPxbFiynbDVLyUoB5/4mwbggJCKqWcWLXkOc702XkfuMANGy7OD7QUCV\n", - "nopFHkp77AuzGvvM2JQndhYVkdbX30/kmHQDID1DcpthKQBbzUjm7wgAOqbulxKDc1OUw1plN1OA\n", - "iXs8Ju+zQDtZelKPfekDEF5iPA8IQMn3LLocZ168PVHW73hdmgfMFTsqduJxZ1oiezDuUBPUKdNQ\n", - "1lGg5KUsS5A9iNuo+n1shJKCmk20FfXGeNEywAjYeaq4bao/dd8nZn//htlIayY083IymAgdHbKW\n", - "UQAAAW1Bm/5JqEFsmUwUTBH//rUqgBbB5O6qXkABRezeefAxp9PjwxeDBuTTFSUNk2voPSz0T3Lj\n", - "1K/LmQtEI6YkskJKgxvIXHGf8LHTV/h2Mg/qV3IQ4zvBygOQs98iZyR5jgV+hQ58R6xIcus/6y5a\n", - "HrkViRrv8Sk7So3LYWmfkLzyR6vcCKhF/sCJsY8RS8BK5OOGU2Ll4Qs1n4jPQwTLDELf8SF2+07z\n", - "zB5hexERnOHmWZ9THKXS8j6NXPrj2p32k0gvmlI4b/Of9evEX9mDBp5GtQHOvTswQ/VYUajAUXz4\n", - "5w6EHuB/k+FBz9pe+B69syJ2X5MYn7Qi9rKpCl2kZv4uAWXuNo7oIaU7hr6elcFz53tdL9AEjCAb\n", - "BlT3p448134hjvo9lj95CHF5teK1w+R310Gc3NQ0eeJcsiYD2EoVrHHjVDF/m8I8JtTUFdJ3xm+G\n", - "muADOcIpcqYbeqyKWwHmgvRze+DMQbkLo4AlgQAAAR4Bnh1qQ38AfBSmnoPKZzTuFWeZOcrkeWeU\n", - "yVIALsozlefbqRZf6f7w7fkPoFSkdlxkJJsnO6qzfbc/Kotbm2yeFrIQw5yspszQL8gAAvMHKSnw\n", - "f4CTQ2vfLY55MADj1baDD7LZtn0UK1Eh1HnwXobc+mdHd/JEl/a2Tszf/EZ9+J7oMl+BYsjWKwNY\n", - "vOv5flnnPLcex/hWFIF4n+hpBybvasl5hI9mV0CeAAyAclftj8N9n7hadcpM/TOVmHbSkJ3cr/k+\n", - "StSwI8gY9k3tmbMSZc42caMpFr6YdNCCIj52zmNBccPNFxW+UT/4qCqtX1gc2j7obKDaWzC1yj1A\n", - "td8/VAjqVn+FzuuEokhhvubRT3RCdxeWnBTCG0CxwC7gAAACMkGaAknhClJlMCCP//61KoAXgkIw\n", - "VJpvAgAqN7f+5rJJcY8tkjj7p4LozjswOy2dTydK33mOBGS+NojRzBOlwt3ro+/vdQIUTIVrXKwh\n", - "2SrHPCPJXQoCjJUPkRODCmqbZeBHsv1r7iIOZPpX66HYYhWgPLvPzAb/Nqu9nQqKoyphhNy32+S5\n", - "qAFvjRKLSjPAx7GoKGUNMbYduhsBsrvVTwhrV8uWAls2mxYggJzVuRUZSL9cSt+tjl44BXjlbo1a\n", - "I7ybNHG97GCzcbSNcg0RA+iqwDsdnrZCO0zsNdWK1qVmER0PsSf0dicSrZwIcxZWy6JbkwQn5TnO\n", - "kAah3wAs6pJvW+a5ZiJHl6sVlU3yCOlrECAESqWu0YR75WfiMXgesBOuXGGNsC3icmPYNzM93us1\n", - 
"7GQTI6RmmFHGo+B2yAB2YJiK1YN/T0ltUuXfFAvL4UdHgEXOVIqVj+S+YpITMKy740IvYQ5zuZPD\n", - "ahdXF7HIU7xE0W12w+6qkuyZwxUMXLXdgx6svudMor1GNfDCdymcKIidhuuXh7vdQrgbivH7usVC\n", - "zjMqgjGahkW1YlmytCooEIoULx5ux9DK360iAi4u/nAomESdiosanRfQ9jQdJSpo4rurLfeCLF1Z\n", - "XsQAQRTcezHlxp1tz3A3WsYMA9urPBB8pUlDdB63MfZDCBphVx/Ddv1AMvPXFEPu18oREsV3BdKx\n", - "e3lxLWWpytzF3zXttYGgBb90j9DgRGE1uaAWyEAAAAEiQZ4gRTRMO/8AWVV6uU/hFqUNYqrP23yu\n", - "FpB+ECoAQNVnJ92i7ZF1i7u1D6K4L4gxm2RaiGsRDmf2iYWEjO8yGHAqwpcDep1/+H221WMh98AE\n", - "VV9Ferf+hy0D7Zu5rX4Hp3s1TpcNcEBIKPHVSHIzaZKKfPXkqE/ga/eepp8Bzdc39OW6g91hVVvf\n", - "WJxrnf77rapWbmivuJFfeO9u+RRykk/agdEi5E/5a475KGQprA2yl390PNrCvoamPyXbETwtbYAQ\n", - "pF9uDZkHdN/NQ1P4rz+zQLJx21eQsP9WBLswpDFYg9BjPw+3VrVEzeid2j5wJBlq+56Hw+Ex6fI6\n", - "1O0GbWSAC5/5Zg+kGX0Yx7/We9PseMWGwXWIVwqI7oHPEnK6wUkAAADgAZ5fdEN/AHk02mburIzA\n", - "1V5U+8CauxZABexQ9zxvy3GIkNn2+19EyZqnRm0DMMsXP4ZwiY8vW/qdBTlATfbmIFDxCTzt76+L\n", - "X3WaNfG+rqTfzj6gLFFHl5IJDtQmIC9KAmTgQM0Lp8TEDdYJnPYGFybq0Xdyl74+130DteV0SYTD\n", - "hgB6230zJvCx8ZW04pZHmYvtJ1LZAxF3BAWKPXcstkh7/Er8zYdPblR7K6t0r3b/sIHpME53VRBk\n", - "ggj1uN/p+iN4KwToxjP8kZ1opB7xpkyOQpicygiGnwjU7EpZpywAAAF2AZ5BakN/AIdka2Wer/IA\n", - "EJVZr+9KNmiS7zXHA/5uJU6D0CbJOrsLPWcfwAUCZZjhlCsnAlgzrrGOONmuxU3En1TfTKb/7Pu5\n", - "1R8PfIYkV/dZFitvMyRPMvzwXX1OcxtjbhM+M0LCh6zNEWJFi2Pi95t8cspIknD4iXNUblA3oEFp\n", - "VGuXt+8S3Upf64YqAxWADhb5zxXL+O/gnWiyawM9fyRrYcExecMkEiv5MHRsJs8Euzdps1vwxzNA\n", - "Zu4bu6ic2K2ueNja78qXGaHz7xLoPIVJv/T4KAuseyOhznfFtKf0Ey0eSBVK9qutGGF83lfe5Wtv\n", - "xb73lHTKLAyiyJassoDHBSQLAcUPb4nB6xWNr9G9gWtqEIp4Or9tKJzZIZ1tnIKZFZGb0ELAlV2+\n", - "pKKDz5nW+syHi871Soc3HtgomT3Y1cp83yQG1GdKkcJPkU1uJVzsVPzbXbSU7/z2Q7cikc4seN2D\n", - "ryQ1l58HjUs0ikCXV/V/CDkAAAH6QZpGSahBaJlMCCP//rUqgBbmS0XBN5gNQAaCJTjyhVwVkMwl\n", - "GF6KXnd0XUyzqjFCJEv0D2xQiJu8if6sKo6qHl+BP/MZw8ss5OKq407INzCjWOsjf2HTKyC5fNLK\n", - "wiJv+PzieOozn64ZK7RRud2QUaDe0kuhk4uCClSYQBImrxmWeEf/X9zH3+ilYhfoZigVm0IoMiuu\n", - "YX1ERVdg0Ld9E6wxbYMiQAGJU1qeeTwc8vb3w3kiJheTA2PNXtrJ98RwtpnhN6QxMe1dw+aQWI7S\n", - "j0oQ9iNx73N93RuNVRxXj/57S9VltjA0RTZBjLvYS81QDA3fBgaNHNzOBZ7dztz/rTxxOpumjTTw\n", - "x9FgnvlMsjx7FYPKUcXD5quVKd8lwTlOiGVI7X1HEv3Hh4EvpYVt6azhUBI1qGunVb3X1lyMhWJ9\n", - "p3muqcicwInEt+BuHY92HoNXaaJJbbQmNX5s3QJbI28Pg4gc2gaUF4SQRcBgM8uwcYUzxEkBS06L\n", - "0moZm8bwMsLYCLj3fgXOyFudpfg6jkYPDeVK811WbzEz8Hcd42XVL0EwE3bwDc+i2I4+NERo6J6l\n", - "d4d7nOIvqUuorZnDPtlYcfSWgBqdP0tQHvFb4Sv9QUCBvXlH2IEiNzo/daaHVtbFRNZ3cag2HOiP\n", - "lMxyt8xYJMnG7di2JiwAAAD7QZ5kRREsO/8AVwwP3fRRACC0tQoY45xe6yfL8KMHlR1wbd4HcPUC\n", - "+4PcnqOzdoNv80ufRyOopFYryJahX+qWFUVKK+nDtdvegTv/PqvENcT8ykEwwQ7z2oNUdaMITYi5\n", - "4tC5YA9FaLSBorMGx3aocAbiF8065MBqyaTkiW7FtGRHVSPubGixAl7hiQRoBoEipfCxkE/EBoII\n", - "omSCNrFRyjd8oY66cDfZt+iBI44uLDeP6eHMEpBALsV0FY7iWjBLaYO1t2PsklOb93SAExoyIX1I\n", - "TiPXiUgrCYe7dgepAF31BCnOuxiIAPWKLDHZLhGOJBLqdemk1EZoKCEAAAE5AZ6DdEN/AIteG4cJ\n", - "hGXgWAAHNd3/IaNiUh/zKhTXYgf+UKkbUvWJoLo7whMXByWkvy3MotNcPaSHeaKS5vKy/hBJIgk5\n", - "CWcdsbd5QzFHyjOIZiaEAA1AziqRPTDRRVYKhcrm181rAlAdaYmvKZAOu92pmI39/PSQjhiMouSe\n", - "XVT3pg0s+/zN7WMQCHqTmey2TTctwD0YnAH9CK4EMAw1jPCCTXgop9epuL/iXjup2S+LS3pGE3iO\n", - "oIHon+1ERGRC2Vp3b2QAstSXzK/2zI+bVnxf0PhgKqa/NeuEaF2SBGZ/TyqGPDnQfJRorCp1s+mw\n", - "tm/3aVbjKRTXeSwl+OCfF6rMqjf/Zw8/4yrjLNmiyOgD8OWqATkM50NFqOShrrTCaHdcxgVW70ss\n", - "cCXKxvzAUCe+4nK4C3zP8QAAAWMBnoVqQ38Ai2Rc7ISR6q0L0pberS7nbElvP1eAuajd6ehFPCEk\n", - "va4007gA4DkP0YAYAumNCN0kma3A2DvFPa+NTDmrilkXNhiNVTFRLzynsy8rdgQPBH6k5DFr/4eZ\n", - "jmJjfYPWB5+2eEYYc9uJ5Ni70hsVFfV+T8zp+ZkLZnd2wv7AZ7A8baF9R5O9oQlCkoVPxkDHTrmt\n", - 
"rElQhX8Fi0yj2+BVP5O9UNPGQU0+M3KYUTg9yTBG2cCw6Drt49/5M/86NN03F5R9JS9KGOfJjIlA\n", - "koCavGpTFqq7OYU0RM3ilfXBmxvL5QoIK28Uvs71J3h/IvKmg4v/14n3/eoSpqNUCC77ty2SgAAi\n", - "rxQNIHz2GF/lpTynlwsORrYNT1lJMVud8AAQb+/SaHWQXmhJ+8cZTt8XuMgG/t/hdF6GqyG0A/Pn\n", - "hWRq+asN+zBaeyQUWZrjl8ry0h3WPkAZksFb/gV7ABWxAAAB/0GaikmoQWyZTAgj//61KoAWw9mB\n", - "34Nmlq4DQoTYIkneVdOFHxDDrFwsv7yxZXXwNkGuLMduj7QGT/7lr2bNfzApMJfo9/ffM5g789Cz\n", - "1Mn0zxePHMHBL6IHHRVXWyqDMhVLYnQ9xFtc1jml18If/8STBCOf+AZjMnARcFmX1IwLt/ziVSoN\n", - "e4GPKKZqfZWytoW7461OuaeZ9dvtxrCL+W45zobgR5vOrVM+Opl+w/eFlupHlgpQBWgJcPy8sZC4\n", - "/O9laiYA63xx6M701UUvGFsRI+RM6anXyjKc7TVrmZ/YQKRjqB6Mejs2G1mTDkBn7T2ZURI2vZ3u\n", - "VXRNsQnGYDxRUokS3YRHs9LEF/gxKSdLEEiHDqcoIHyS2FPM+cIJRSvB7sxIA3hgfN/O4qDK6VO+\n", - "t71oi1H0Bkz1ugONnVTpQr+WeMS5AtXXNBMXU+ycO0+R9eRe9BwSk0V6tHm/HJ45oIYvyWTj3yZa\n", - "JQ6q+o4isbf26PsTbuSAcvQoMnzEXJkqElGJ8Z3rZtdkIzQW0DDnXeNRbj2wQmuUNBknMsWOw2/t\n", - "fD8BErzYLXI65PwTY+6R5c6RWYzF9HNMLBaO1c6cI4yEu1DMKtZW5FrmVuc6hg7VnWxgAgOdFKFA\n", - "QvmmcrbHsqCH4rkez1y5GoMlxeOuW5WKa/JdcefAflYgakEAAAEQQZ6oRRUsO/8AZUEtmg0dqwLy\n", - "ubLYtABfXw0ri+bvSnwBqWW9hB3/jYP94x5LyZNY560IvuBe5T4EX3/71Gbqj7BS5SJLQ7X1JK0z\n", - "I9iR6McwRU2BDEhu+2JQm1RA2fBVxnzCyNr1JVnfyyuumlkNzE8n1UgnkIbS/FMxc8DghB7zqZzK\n", - "rkagW0hHwSjNf+LJf3DnbXyvnzmB1lcv8Z9QlsnPKDef2giSgbZeTNWRMfeu91kckRy0SSKkaYVK\n", - "KUUpf450Vl2TzPLRaNhk7Du1IJzIJRf9supxssXD9v31LAVibgyznyLU/cS57Vr8KEXG+WpKysV+\n", - "6iQmQ/hCoRg82drzuniAPltxm8MMUZwVMGAAAAEzAZ7HdEN/AHUKF3WsfCAA7NAZyuGlRySXJzA8\n", - "WtPYIqCp+udF6BaVoG3w794kSqeP3syNbVlr+uFhruNMOOzTsNGrbATFZMl9DU6mhIXZ1HEAskmI\n", - "VVSgXlz4sVX35JqYrDPP8r9Bsg/O9tAp7LnTMjWlqOdgOPhHpyqf/hmokPsCwqtKfsDhxP/tmX60\n", - "fhM4KsfvpygzK8jmUmY/GDBCISRQeW6U8uaq8guf+cvy+sP09JLJ4HsULhIsm6kyYO04HBdOFUDr\n", - "/8IzlOKX3w/FCxhimlJIduY8iySAFQmALOuag1Ry1Z3p7NpGIGhZp/q5hzsMAsH2jpHXQPdtFNFH\n", - "4VkqDlRDeGqieCr6gwu3hPQQfF9yauq4qf5R+bfPha9tZ3XjpRO4eqNaj2xEQrcb5cIJOAAAAUsB\n", - "nslqQ38Aj1e+ZhXsJE07lvgA5ryx/X3Tt1hQ2T/wP93u+Km2fQtCsS47kHT/v+BMMbdxEWzwYvcd\n", - "d3NYalS7o/aUthPBRfYGmx2hUIQijLOXN4leC3SONeoCputIRor3Lgsy985K8UL4nvf1+pFmRQg0\n", - "eJgJ9ubt7jVqU4S6enDDZ82+hYwxDWOROomkxsOv8nlizRgAHHE1n42Dq5sLIu8oVYp/4M1h4rCy\n", - "m7AmDrR9dbHlpV6pqPLshIJSKr7R6XCF5H/mgt+78ttEoS2XxbrmVQj6DQtTzcYF1gqzE9DaiXTc\n", - "rKcf1aBAFclenBiNHhbAMEE20Br4FIkr51a0ynzJocMgaUhstOH+7gKJGCsTPkykOiVzQeIGOfi6\n", - "AmLkbzIds0NOnV21ExFbxIFAMu1BymG8Kjwvo1cLb7372R2f+Qt5Z8LjmGrBAAABxUGazkmoQWyZ\n", - "TAgj//61KoAWP/AeMmkxh4qDG8hcZFMZjYIY//v8PGtlbWZ+A0oGGFPTAdgmU2TFbrR0QmwUCouN\n", - "e8fq+V7LhZ4IhSGjAEZXRALCc6lvXQaVk4Hy29vGup69bTfpCSIWWGXFW7WfQjL50GRbZZRZHQ2m\n", - "pjAJ2N9/bloCCNQEfrVxCeDkKfJqKlRpIdnOUaiQpsnEysqkLqMfxaCLAtiv1vFXcLPLizzlMPs7\n", - "NIiiAuhD4+CMokPsODEut5yq6fM1zRym2P9iids6rfyvN0EtWlvUXkAIdmS8HfE5DlX5rtipWZ2i\n", - "d9rb+tQcwCfWN6erokI6tARQJu2c+ZSF/sI7qofDkfNVCHii2Msza0cnJEbLkEfdF+gBET2KrdRv\n", - "E5mgO+6ICEAI6O/h7r7DxvTQ9Wxzo3mHNo6898yojVZYUAEyiEUBn5+alz6XfA0d5GcOXFRjv906\n", - "SVSt5h/ZyjXd+HmcrubYPlDuxhjCrkqyrKcbhfJHp/Mq+DI065H9OXdNO/+uDSHvPcKkibqiAVhI\n", - "DqTA+NZM5+PbtXMsqU6iKpSzqr3AN5mBITP84n9JoTkmCR2U/+5h8eajZc3UcAAAAOdBnuxFFSw7\n", - "/wBlSP3uCsGGoV8bqfG+TF6JTvUuRSAD4pZzJUFnxrFOJYnshFJtjPOw7rAcguf7FPJIlPqbN5qs\n", - "fqCPl7TU74m2w4/OJHMnDpS1+crxo620hZORUqqaN/UeMSuSm/KKx2/MSsIgkvOy0fYS1MAD67Fk\n", - "Z5FUhBYQOPZatG+Xc3Icj+kvLjp5v9fX+nJsaNN4CCl0quEK1R//8eZO87p6DKKxlnRfV62uCNE9\n", - "o2MWYwf9qwHYbtyqG6I4xWPTngQnrsOmiw1Sy0bIvHiKKw6nsCsKdLVPqCFU/q5rppy8Ah4AAAIT\n", - "AZ8LdEN/AI9CIO0JMMhrV/0AB0HLuqwUdobO4BdVbPV1Ioua5WZC0IWTaPE/7qAFTCgAnl3rAoSn\n", - 
"Kk1336t4zGyyPYAAOSIcqQwF8zee7dn7XFk1tvgy6W/qOMTmkEiEdwceoRsnhNmrNp/TK9OoMIUg\n", - "ShyIuwXG8nP6tDCpAEYSuvpzo5kchXf9jICMUEGqQZjLulIdzbNUEecLTDRk1r3gpdToPPcXdXTM\n", - "AElxf3acmkXSo1kx4tBmKJrXm4kNQ2oDIaqLOc1dGZ+ccoProxsI+jQiCldj17rGF1/E4alcIa3L\n", - "dIofRLGOPkev2msNj9eN+tELiQktxoUq9fKnDsRx9Nbc5IkysRYA/KsIu02gpfPyisLPQwjLSjpr\n", - "jTxnZViCfPC6UCMSLVKUvso8AB0eV8Q+lldoHmqd+EeBeeJOkPU3vuU/GQacMWsLnKmVt/65Nw0r\n", - "y1AnL9+YKkDmvNgpqgQANfZvj5NhddHche/p4la1cXWhY3W/jmtWxMTkOC4tX16bao5sNwcVWRvt\n", - "UHjkDIOIXB+3akBV5Lzaef6YjjT1MeUeFh/FB0tOMV3Bhvdw35krP/ItZ1RF5hRCk1oYqz0ykGZW\n", - "YkciBlvCsweWM2wXwX55h7SZHtxiKM3rO4Aff+TOWGbe8hXaapPE+4wKof+j5KoQ530gP62KsQIG\n", - "BV49pf0LYkAEd7yVzO9dhYYFAAAA+QGfDWpDfwCPWoxxjdaiaFtca/OwfG9dSAC6jYuqYuZmzKSC\n", - "kzbTtnf9idy9v7frgKuFjQymibohZCHRXBQdujo9Laqcw233I4Za+//Mdf06kxHe/IBTsCsxcSfV\n", - "ksVUEdqCe9dEwWwg//4Ee8Le2gLXqz21e4jiFyBOjP5GsM1hpupcfwZtr5Mo/ou28BY4QZExXJ0H\n", - "FzCqK0jKq6c//ut1tsd+kiOyZUVGRAFVkS8bi0vvjrj3zga9Zaa6Mt7yQii43DdcrobbVIWdc0QI\n", - "3+rsc8fgmOnJ+GJGdWYzpFLd5zMjS5ofw5IMBt0GmHVcG82Z6YQkqKJHzQAAAe9BmxJJqEFsmUwI\n", - "I//+tSqAFjc3NgONUfiwAKbp/vtZn3NtK6t0V/4sA0MV4unWIJlE1N72EjQeUPmvxOpceaVXIrAK\n", - "21oMRdsBwM4wyEJDPiji6fXmMlmmsCvOtr78Aj8gA+xKnVDFjoVlH7PPNvnMo0iZJruZeFy1B4T9\n", - "/2iVnlLy1r3LZhoykeyNXqaKEANWeqYl2HjpH92g+fHSONko5D2m4SRKJwFWFllUBg2RTQ3etVYS\n", - "PdQGNCLeaZwhH8zjnIe5Vuu46VBC79Le/PF0x5A18FileZQS8Adcvcamp8leUQ9dML537b7ARaSt\n", - "9Lyu3Sdke9BouNe3+hTyxzxAi1Setn//aNMjVtdKZIT0wLvPIMCsfe3gvhpNMtez9cWJYRUO4qU0\n", - "Dlg6h/pUIog+BzidDDvn6SZ9WUgEXhGZOFeOBYowQfwTGI3ac1V8O93aTpJwa/om7scQbOrwAjjK\n", - "gaYt9yqViBt3FWYRIoJJGYqmGJkf0tLvcymA+Hyayho8kg3J33tLzi7Gkd8xVzsn0AbjvoJ9u5le\n", - "OKsB4L1kcStddnytXouu9GStBCQSRLPeb+iGeZTwQ5uYY8D5fTAcb3C6Ob+B7IWRbbytzq93Kz0y\n", - "yYvbeUq1qJCNW3/zJeXeH+8yV69x5FRyM+55j6UAAAEdQZ8wRRUsO/8AYsUcQvOGOSSADI46r94B\n", - "/W+PEO3biH5wUahFid/4E5wZcJb1S+5KPsyD0qQEL2HibG5BPsDLysut2eDJfU6ijjP6zrYmNEWR\n", - "huQfgh9NsMVuoggiphkYt9ccXxVhYHn++9K8YAnkm28Kzp0jUWHgD2VeIoDjCfJPNnBqH+CERm3s\n", - "nubUQ9LmttVf/+MNJAJgtOFW5A6IBAcBpJtd5kPS+zJ8VxzguhOiD6Pf/zfgjMDUsehmT57QUanw\n", - "gbdNgBf1mSXZw3Czfs4swXmaj+42V39PQblTRJ5hVxxBfyBMHdtD+eP+pUlQP8pBAAnf3v75+Q0T\n", - "L19oeS5dx79IIwiodA3vtFf2KOiU2gODZqY3kJGizWNAAAAA3AGfT3RDfwB2j3tYlaKo3hdLneRM\n", - "Dlhayh8NourV4B4kYRi+kgAOdUf8hAGAI5XCPTeroAwXn8G2yGEphnv3FPeZqmLNmvgLgUkPciaQ\n", - "A3x0WVLvMk+lZn6cJdklOXHEnjNKsClw6wU0RbMDBk1zQUzYb/75rZ2h0N0KqL096XGATDutyhUZ\n", - "RVkyTgfbEgHdPAmzdroStgpcOUEN4xVVZX2E+XrryGs2/tIi+iUaglsBszkGSHUeEuoEpHc8PRHH\n", - "tDc+6s5rO2oABm+Gux/PUd+4yoXEBbF4DtdMIooAAAHGAZ9RakN/AHaNgkMVTymoPnXABzXUf7nM\n", - "R8KlDfCSlxubwbY5y13VVoGV2GO0t+vExf+APmeqLrIGM9X5aCQgGSaQJX4OQoECqyNRzFZQDLhW\n", - "KA4dfYJp7oYRPF8AMOzGYqm7AO7w7FtM2J0yD1XqM3LrKYS1dGZTAzMM0YXyhFuS7+8HWwRTCnl1\n", - "B1MtLMYaA8qvJY/AATH13D2takXBcx78I1sCsI+P57X6Q2Nh62/bggQuV3uhAAN0tyrIgbNQYVBH\n", - "gFwoUmXrxaEApAv0P2E40tM9SJDDcZe8DyE7ljCyxGjQA+gKJHzTkZCCQsmlxDg5It6wsdQ6cusN\n", - "DyWnlyoq3MMo7ugMYcm1YMEY73l36Y/R5wo4wUzuNvV2tJ3rSYBCfXsVjc5o1oA8OllKUpgpBG5u\n", - "9AavXOqCqjA07sUF9WlQ9JPrhiXa9bThYRp0lNBazKKlKwsBPK9zJ1/OayuptCCUOtFLyDYWpp2k\n", - "qNXWH8r0IpnJjxnQFcNmI3LKk+rH0vqX+48vd2BUqTcJ4rwX4e+V6oU1+lJyU8fmS4Kj/iQFUx5A\n", - "ntiGKLVWwqfkoYN2YexrEPVBTpKi81wf61aU8NAxYQAAAjdBm1ZJqEFsmUwII//+tSqAFj3B7fR5\n", - "G4ADaQx//3+BfZIcqzxSrotcVc8CLm7cBBc8JifUTg3KyGbsl0UtvUGR3t77PRffuzjjVfcKeiAp\n", - "EmDpLoqmMXTQU5wmHksjapt36fasfEiGyN1dOKyOI9nT0TFFL0pzQSss7Ux5GajOaQUF29zSIoeo\n", - "7hOusjWiFyZylISVuEBU8nCgDYn9P601XpFko2u3FAuYp/svCLJOzc9W7b14FY05eVZdhfmiv0Wm\n", - 
"d+i5ZPIv9mhB+8Cb50V0LQeFfsyfPeAABtfp/HIPaN+amWONE9vQ2YbC1JsqKljPbi6Vrd258gHB\n", - "PNyXvESqATfkK1Gnk0AWxo7XFr5y0Ce95pJr1n6gAd91M5RV5lL/XAgE7sYG4524aA+cXAa2XPdd\n", - "1BugfbN6YGWbktwAoVIXoUq7TnrmhBrw2FHa1aE9uMJerl9x/Rs847iKP+iuBUD2VIUOVa/G9Po0\n", - "ksPo1bHVIsITIKnrhXV1NabDgHAc5kIv+PJk6IroGA19oMw2I1d4rGiaYQZE9dmK1VRARJ9VXDBJ\n", - "Vlz3aoQhCyQZvwzvxWhVA1iU1RO1TWnJsppajNeO4Vg4/b+BSviIvrSwwqmjaRr8iuCpVTgz+ZJ6\n", - "95zLiSdnoIFqQJA1Hz4YR/KIOmAfhTTnHcdDelso1m8Bx2oHlzAOiYwR4NhSSRD6EhhCU2kXf5vn\n", - "vYdShk1Y3/pp+Wd9yZwIwTneJB0AoI0bbmfrtbbWj1oAAAFQQZ90RRUsO/8AVxVRwqizyog1fzvw\n", - "w3oFk0s5kH60rPhj0qbUv+9nJnU5H1hbksC+yivmpdt3FAylOp/Re8NoooEKQr4q7MX/kjNCB5zj\n", - "aCmG5E3TxVGWGCYMCsdEF1I+HuXX2a3wLCwf1iqCfznNMRG46GE6nIgxc91oY/zfMduLLCzyb8AQ\n", - "b20W2eRODsXd4+7XC1RndLreJ7Km543AdL1iUo99hYdoASXjyWRNv6wvJrmyFngIDlQOrLluZf/9\n", - "T8Y21pcggXpfTtvdj+B+3lZv29AFHkL2xGPZvyL4UyVUgb3U1DWd/iySeGzlK1IbRNu7obP1czi4\n", - "Rchm1nI/pS+cSuamJbhlQHIreF0u2/zcrSGkuOpbObSfAY//5j6RVfcQovw5wL1RQN0tcA1GtFxu\n", - "ZpovaLthGUkeOPh8iV5bEpupJR1R79Ew1sEkTDugAAABwQGfk3RDfwB2dNpntdq7wHtHkfExb8Mi\n", - "4AOIW+6weDVD4WeLhja/JOA5FtORnuW7CfHWfWrXcPJWyNJJfpx2maEKeggtR3RVEAdA1a1truYO\n", - "N3PBvt2C5hri51AyWveiUQtRNh8OhcT8b+NVPo5dLHlfN2wr8ZipKDuUP3k1md+EiPqVCrK5TuMQ\n", - "knvfHHEV8fXqrrFiHhWYrAGbSJdOrXgrQTN4JDv0LMwXs1Nl1nmEdfSgT5BF3DohYi4r2xGfiJcJ\n", - "KMZ1oPHaRBjgxhu40ZP5HqUG5rQWHD92UCH/Terh0cf4e0554mxHgDF9CBXD2Ey6LaV8LB9Jb9nA\n", - "f7tFFMQRIVaLiP+uig+B5OoeaCY5+GdEeHuY+ZE9jNToZ4yOUwNfysZaXJBrtfqEkQosI3EYRZQA\n", - "COu9BHjZjXsKjEmWe9Jj9yWusbXq4WMANyEJEPNSeDcqy2nLsc2OqSE4CgyCqy8blbRZqycUiZt/\n", - "3NpFflI5dk/7eeQ8Uo727U5FhceNm/3Tv/0N3CZNlPGV4f+3/HHJknpIjibzMw4AkTq3Lkxy1XZ+\n", - "FA9yAR3cZ0/eN1EscyudULe5dTvs1EvlYMWBAAABtgGflWpDfwB5Rz+lHWcxYALocP/IVGxKQ/5l\n", - "P8Y+UVeKYTw8iTn+GjVV8vbhgCZ5cI/70wvHdrfJYaZZyRIawh8+61+/vwo8HAkEyAQL0QVrU8Db\n", - "Z7+ORIRATWUQyS/LIyP8q4/O5rf7OuybqgrrJ5JQm3dvb5EYgnYLHCULt4xtpfvTsT5gEynxu9HL\n", - "Km20sO4q1oqcF4MPx2dj7xETa3veUfVJqfvwop/9NWsmPrdhY/wz7rinYt2HcWm7+ulSBZtWIRv3\n", - "yMRoNM+lyCvZDr0PaN2HfwYWOYr/NgyLM3qvI6TujkJkGWBIPuiFK/SHsSPx7iAMcrZ3CQvQC1rq\n", - "psLEx1Lx0vtWsdQAcjEYe6l7VHqUFbgcjcHAYPQIIgi8NauIxLhxUOQnkJo1mXO/e5w2N9AAHA22\n", - "RlXXsFU92TGe3GmYdLlI4OC3IklyabPhxs95veQzY6n0a2BnyANXxWrQG1vVVVAYgtb88NEdo6By\n", - "gCh1aEE1VpUTP0of4shaZpNk/2gd6T34r4uIClLqdADAAdaA4/epPc357p2Ro8OkrT9okATGaQDM\n", - "AYBiPC2kAQBkyn5ImAAAAdBBm5pJqEFsmUwII//+tSqAF4In0o7iUdIU6DQAMu59v/f4eNbK2my3\n", - "LFfU4bVvmOXvurgANJp+yhdNshfKZWyf1yiq02eNo25TtXkBg+c9UZquU5KtxkSr2wTyRJb5fWbg\n", - "+NL8Fosje7XYkSxYEiB3sVwPhHSvNWh2d4v6fN1lP9qvuUnfb1Bn+TdruqmJdM2vx9efbO5Th2CP\n", - "KiH3jeuRzoCzSIUG7cY38FVzT4nUIJdz+2KjjjJ0E7ZNKQ6lROaPqjFN4utrXaZfqGFX2nWmlL+h\n", - "PxS7plcEcSC1oWpbRWphWgodqD5c2VmFV0yO9NkxWYeDoEeaPVORAB/gqWAbIHdoZVHMBBV6fLyv\n", - "D3u5FppjGB4tzB+WC5jnXJKg0Sk3SkInESay6cwWUVJt/G4Tfg6wbMdEkCvCKlRosg/RTpp5P6wR\n", - "Z2iZfctuN2EQi36vtriULh4PVI/bw9ZXWlyhMpAYPlW3C1NvZrlJMNaSqGSSnh5cJMfrxHquXcAN\n", - "CTgojRhZ3tMe14Ny/HV3UfnpEJgrqxN8KZxlRpYS28Q96uqEu6NBBsBIIz0ei/Mg1x57c0aguL4j\n", - "dVBDXATm12Zi0uXfiRBRiIror0O2CDrlUQAAAPNBn7hFFSw7/wBgSQL3wIE2Tv5B6OJXPcoXMcSb\n", - "cE8qv/1v/uy5HaAJNUQCTSWlcVovOwe/GLZOdN2BNEgb1OlzNEinzyASzg3GuZ9zFeyJHe/zvxXW\n", - "qHgQlhmuH8QdE1M1s5tXy5mwAyoAiCrzupaN60ez6jWL/yRvGdGiPt3qJJLeMG60zAMKa7QhUJFJ\n", - "FMWUFrcLW6iQXx7VTZR7Qo0gz/aCe+BxT2h34J4bdpQTH59SHjOd2X4DMr2kpW5buE3EQBEKSUD8\n", - "yEiNy7MVRtsZHXt1V4Pb6TljTGXtC9pzGwEXtgadiRP8dhtDjxgpVN3IyoEAAAFOAZ/XdEN/AHkx\n", - "u7J3fsEfo6cXtbkNOd4swcOB3voAJyKHu0c0/MGiiYXv+2wca3XUwSOEG+s8df2rHPxj/J/Armyt\n", - 
"j86AAAWOWZsl8AgjGF9fWv1mQf9jrWNuA4APvfeLBFbZJZm7otp6Fc0DFqB0XCbEvLTkRU5ySc7e\n", - "Y4CD3ziWyxgWkLgxNxAV0V3rzOqUGhFxcTbBCJI75knYyulzgB9+SazwgLVSR2N8nND844Y7GLCN\n", - "0aeRWZgNIAWJkPPhP1VnSRo1jOpV+axgAXL8ExpNwIvLk+O8lekZ0/1o7sI+uJ46XyI2SuA6uJHd\n", - "bwUKNMI2qDKAM6f4kKlJLSQWqzXAi8hAQzI017i25Vpi5npQJ4TsJeyOHRvmO1wY5ZnIEZHyhgB4\n", - "IoLWrdA5opbAou9XxH6m1F6osqepeJLd97Dr7+5BqWzoHoOLhOxNwAAAAQ4Bn9lqQ38Ah1fDGltb\n", - "SoFNBABy4LNe514R+dnaDTYn5E46OmsRrJgYyAm1lSXdflAXI1+CFQXE0A4eKb0poyZSLaaXfRBJ\n", - "r/tA3jW8xYt/UxFDszVrqnPHP/Ny6pw3mJ+pwWr+YYAHxNaLyZj85nxRNPFMUkOr96iCB+MslYrg\n", - "cr/vUoZCrrFka9nw08yFJlyN4Ky9KHUYJOXDrBIiz8KQQaHFalCe3rENKk9raHLB9E2PdI37xydW\n", - "9R3Ktqa3KW5rMJCOoArO2/3trkkCh+/FDlbsei4VdbDQ32DjCaAkDFjCyuqOJNsi8nSI2KDSRFCB\n", - "83l81kCObhPemVMTlMBQzSDvOtDFUtuVwHtirD8AAAFqQZvcSahBbJlMFEwR//61KoAWweTusUEY\n", - "AFR7WLigAceU/KgvW9LBBRTRioW652v1Xpv5tYMFhkRmmlUca4/8lM9NJwOZFgbdLq3dhRjr1SQ+\n", - "iitgTnIKVe77qt/yWy3INzcVxffYfGucVy2ypyvLSUZVvVzu37Ufe4d1uKQAC1EE3Wwzkx7sEK4N\n", - "QwJyCdTZZnLiyrlEXcLAMbB36CvMtmCiaP8XPpa1U2RaJxnBB9qYeP0+JCORflaC8m/hyWfMppd0\n", - "XeCFuAYTEakC9vO4HVF02QH4GZZigg7j7bXnvstEtP5QgYZViZcOoAaQGKtWm3PCHoS8mKWfCUk8\n", - "ZLC6z2a10V0U2DavVH2m02W1Lc4/2WzrwUTHr66DOaP+urnPdabeHdXruv1HJ087InGSipJtxGko\n", - "4rppNbdlP4z6g2o/ksCKcSZ76uS1diKM/39wzVYDu1tkCD1lomve9NoQwUToKqCn30PDqMAAAAEr\n", - "AZ/7akN/AIdka2XuDkeawxOj/BZhZtP+kNbRABb4RmWT8vSOMSH2HVKuz5/n3pn38gQM6YQqY5bV\n", - "v8KsLMWKt//3BpX7BUiSjA/GsXEpiGachc2o+KqjjRfujy3SLc+TvzNfgePwT9w0Jj9Y8j6ORxA7\n", - "13x9/iM5Lx1s2OQQyRluiOYKxXDE9QjNulPCcMLJFKpvAfnZmzl0pzzHw/ANcBEDhABHQ9ftCkUs\n", - "Q4pQOQF20mJ1++bXoRcUz/lR79ACwohpzpGuaQCknCVhUL3lnnyQzloB0PAIRq1VnOd+y8D18t8/\n", - "IEva3L9FTrRi90eT/2pNxjMaqrOmFzrhjd2kmSd3YBlll+A3KrjDn/HtXx8SDjztM7Km7BEd2LVO\n", - "U1pVGn0+C8gCov9gxoEAAAIMQZvgSeEKUmUwII///rUqgBet471BV4xl2QAFRvb+6Uilj9hVaCt9\n", - "oXOXB19FM5G4bNDJAOl9w7HrxMOF2dPOUf977Rp9NoBObCR9cN42Ht77Y+l36qfp5SrWPFz3DG9k\n", - "Uks1s5yfRvMME5RxPYk9+qohbe5TR7z2WNWBJjaTvhnu4485WU3BaTyIbA4BRRdj0/JwsbCXRVZy\n", - "OMmFdXnFdxhNGZ5JMCQy+ip435WTv8KevLzG3OUTxX5d8x0gaiQZdaPwNC9GVrgmtqTc0z7He5Hx\n", - "p/UnXiE+WgHU095CwXga4AbeOtQbj0tjxKUoS9sAoJ5fyTlHv9FnU0ujgUuoA3Kj0ma5qF69zgnv\n", - "MTXEIqf8zuYuInk435YB6s5Aa1W77q49/ZLR70JdKU9F42nWnuaGIFvaX8JNp0NTGvA0s1VSOWIl\n", - "YVdpY6hSPbDqLYXO/LE7X1D3sWpexh+/kcA2B6pYDzx14bD7OD1f9pMDWxIrW6BpNH75M54gOMY1\n", - "SxoTsfh6KVoyFK4Yqd6lPKCLY4O17tm0vzqLEva8zNeuM7b2yHKwMHpqK8FV5yaEer9Zd+uSgIqd\n", - "eftECExc0GDPrda1mDLPyRR8iDjZRvRS/EElnceTaWiUEonB934ThxItQqnJINdKSyNdNwx44Jgq\n", - "H9/Zh55FLA3sdVDr+1aesKMfNmYnbwaje7GN0y0AAAENQZ4eRTRMO/8AYEUc98FD5/CYkGD6VZTK\n", - "7qaMD8JeD5Yvz1s+LaCSFWcn3aLtkXWLu76WBTjEp2boTz2lISGgYIiIhTqGBdSAvn4GaApcqQ2+\n", - "sy0LjwIg9aZXDdjP9AWFTV1H8wY3dWCf+Rn8X8p7dsAFRxXZ4015PG0t6STtIq5DOqARSPJ32oCq\n", - "OenP2L2rQhT0bU7kBXZqDOvuedMFko4K8dbR3EOKtstAjt1gHGNubjQIVeNhJsdrdMtXEY7juX3P\n", - "NuPteAILXrR8S3R5mIOtuZ+vWEUdS+Inr7FnZsbQiIv9i7KDzU2m3LJLNdjmArFBBLgFXYHDvQmL\n", - "9VT51Mb8gx1TyNar/CPWDggAAADyAZ49dEN/AInJdfYNr4ilmYSAMFB4GADpypoeWWXE3q20mGL8\n", - "wfGmH6ZgcbtTXJWZn5/uB2IPeQFG/rqNYZ/bmIUcKhccFRuPa9wOgu4Qnm9oi81y+ChWQK1KoKDK\n", - "TWWDeg/SDhV8w/q9dFY0rcekgnjPKbKFgzK+IO7hoMF7vhpMoVCqvwMtBaesBfF4bzxIufyftMba\n", - "VRaJWuZpM22/FtH8FxujQ6EjGNr9PHZg3rsxXbkYHRqZvH6RGypNdfKRL4serPMKtCeuCWEKaj1Z\n", - "h+pr+ULdNvwpLLHfA3OCu3Ql8v/sLDD/O1LVB9ug+l/wHpAAAAGVAZ4/akN/AInJdjcgUcZACEqh\n", - "GvWiTtr19IbQdv8WE1dBOa+lNipi00vM+C9W8F7IDH0aaS+KKFaekfOwUNG520lVemVKNYbjnPl7\n", - "LimE+s4N2NJ5SYT5+XRMb+vTvKCkG/By5wQO/WbZo9HorEm10+Tu4CVIj+2Ky5hDZl+kA6mkBK7E\n", - 
"3LwAW+4rGYiO9JH1BLFQj0ZOJq0ybrdVynOYOw8TudsCI+I3fiT5nmYCkIO1N7h++s67fASBLfgP\n", - "CYo7yLNwfifRM3ay+JhoRmwX5tGJ8l9w676Zo1wDaqZ0Q5guAYSxSJk2jHShR6LxlZmIVJnq7S00\n", - "iBOM0mxomzMhjpxeX6zqy/aA2SEREi4ulxZsEvlIWhLQ5YFv6LMkVEh9RITRQOsKGEls7Y4eSRWc\n", - "f23FGWOVxL2MZUmPGVh++Xygx19XCiXwoatt/s2T7zGfLkQ2IBiMKXoeDb7yiR4q+0v6UjACWT2H\n", - "kOIRMpG/B4KQPsfMRT0Rk3cAwV9dNnKm4XTlo9P9TmyT71B/Greq+KvhEBDxAAACJkGaJEmoQWiZ\n", - "TAgj//61KoAW5ktFwTkgtAAhBassVgP2a7WSOTniW7GlpUC5YARIimzpboyDKn/53KIxVBS+A0NS\n", - "3NuuWMzq53zfHvhoSdYO4dYooBUDN2VkLpVK3v3kQo1FoE02X3cyV2j6ziOTJORgWGzqU5k0XKJO\n", - "1VCPDS1gJclQYem5NlGAENmSiR9I8XvNQLGvpLGF/2+aU31xCZzIPp4tUxyLu/gVqq+6L5DezfDz\n", - "gPP3+vv4JFttE5Nyc7LysmCaQfUhi6zPymHmdLjs3bZdma4hV61UMMsGBNZfYf2GUkV1dVZ9kkfz\n", - "RyUYJPFdwjA5S++T8sc03o81MYXnXYkO9hGiG6RRLRRV2fPSgGhghnaqxRhYVQiuVS0ENIpjxqqc\n", - "KBEaAMs1VoaLKEOrNhZ8yB1VLLV9KSiM7/prkkNKRuNLp0WeTv2eHtXhIdAfhKb+ic7Pb48CqpOl\n", - "FnnbgphlxDaS1dplrA4VxMNzEL/27xNMQzhuRvnSDNb60j/kSJHw5x2JG6G/VwCoVAfFrZll45AB\n", - "Puajv4y9+7flMd/pR8Rg9UAn+cey+vNCcCbbn7FNSWq2hl9cymk4fwW6iqBgiFEQ7YZtyDoNCyYz\n", - "KAnW0gvHCg+5n6+qxC+xDS291Y4JfSW927ZZudU0tXxvupwcKf6fDXxz/bqsOMvxj6Y81+e6Dezh\n", - "B2/8nCpk1Qc7N5s0JoStEQ8+K2ir0vIXayhFQIgAAAEeQZ5CRREsO/8AZTZTJbuKD3PiQhYpzA/Q\n", - "3Iqsld8XUz3sHppFsAHZevvXPBLN2cIUd+YCbEEH6MplVFEcbuDDV0dnlBcrCNrbp3+CAOdBsr6h\n", - "0YfLGDPxHlFlUCi4qTS1o0TT2Jzkq8/O+TU7SSImG1EjEmOGpKvxjn7KxERq2Pbd/0y1sNHk5hiQ\n", - "eJwHwc7Z19aIrWes4h3UYQqHeU6kfCpUHVgnGubU2A0Xjg0UrouNSumFogz0StLk4fuhL5slF3Bb\n", - "3NpP7YhgiVLV0FNM21/pfbXvRQFzmliOaZuScgePqa02nvOdEHEpGVRPLCGL/tvzSkZqhXResmQg\n", - "1qZ/TxlvqjWYqPRThBIk2nP66jbd6NLagdWz1BtbrwB3TQAAAVkBnmF0Q38Ajz7dDL7wKLyRAA5r\n", - "u/5Co2KbB/AnQg3XvWeaImUuto8KuobiZ5Rpi0jf/+r5lFprj/mYxpQ5OwqjQqFG0eXwqi1D6M23\n", - "HLH/3LvgYXkbAAGr9uWkQaEU+TeJ38WNXodDC29t8Y0uYEpwNzyC6FqtgkCyDYDpd/nESpdVRRJh\n", - "15SV0TP88AKwZsT7yWH2r5gpJv8AhXnnWmKJ/WMwiS/2+Kf3ikj614P+BDohXhMYGO4GSZ19EkRI\n", - "RjwO1zoy3Umd4iOMuBBPzevAs74sU7IUdkUF24rNAstoyqnAUgY510L3SgPXbZmJYMv+tRpT7ZuM\n", - "oLxE5ACIQ+eHStmGZgh2P1nvrIaZRiBxoWZ1B+DDOtu5OZpc7LbajGP/oy8HbEFyJIcGXHGB5VXY\n", - "HnskMmabuu5xyFIJcVaqbGg3TlqrbBE29OX6xO7K38oavU/okVlIM+AAAAGEAZ5jakN/AIdXv9ZL\n", - "/wCpeCQF0zyG8897iu+TVNq8xXl3pE8eXm424VBKoADmOQ/RgBgC6Y0IzpqUKPVKwCZafdEIuhUv\n", - "zhgtxewRpr3F4VdMy9NUqqvPfGroLPxDW64Af18RtCEv8t7amX9ezvEWK8AgZjHjHXeVi2k8dp4r\n", - "TuMjdngEOGe6y0V0qXE0vJudyGSblaiStnW6rV0e34JxbdN3Qbajy6ozlLfOkq7Wqx1iLXxa4foY\n", - "IPBIjzxdye8gOjZW7bP0axd+wppVHkXrrvuxUf9dp18AanJIIFv6MCm6ujRO2wyu4ZfSbZp/KVFm\n", - "xvxpBAJyjKSdCoPxWylEDyms9NAmwAADmUiy6WUOIsiAC130X9MRKfeLHi3miJh/YDGeINuX+P+e\n", - "NWBXxp3RqAzo1eISPcPztmgXUHCSN2VRpnCOFQoF4yyryK4v7s2U4a7V5e2sVJBhb7kguiVFACK3\n", - "rbLSCnWI4OCs6u017nghnGW3Juq0rF80iqmo5QCt19S62wAAAkZBmmhJqEFsmUwII//+tSqAFu/w\n", - "HjJpMYeKfGxaFh4NwH9VzFzipiNnWLhZf3lim8qQP0NcWviT9hCfSjxxrnYEE59yPQn7u6+tCr/u\n", - "vn8/iyWB73TxWIDTyqwOWzo0R8Wj7McP4QWP8yE0svd//Wkug5+3cHmcpP/ONbeBn+TAQ0VzErlc\n", - "2hXFLnmGW7EB004qvGi/S7JfG21T+V5Sx9Nre0PuomioWltV0uJSYiMg18UwZktQhoyeO+qpPgky\n", - "U9/xX6NUrUyAfCz03v4wSV58lpzV7BxftApX8ZGWBx2zWQV/YeOCEWbmbHqvN18Jd5FxK1iHRqe+\n", - "nBGg6SyBQEQQfCMxCo37AXM212ulRN9X2fE3P9HkhvkaOxQZ5AElyFJ4BlaM9J8bcUgOX6NS6Cqb\n", - "n7IHMcCIPjAIJ36atWVr0EheDYyrwatT/sRxqfSoF0RgoVqtGqstMXZF7XACu2N9LDV5Ss0B+mSl\n", - "kJJqGxc50wazbtpofP341QOLrRCoQigLO2IFkJyqTpln4FgoWIMbx8x6cKkFmIESXv7mZEx6LOrL\n", - "ggZa/EdzllkBPCO/+zBjmey1Y55MrbMpoidNDpdQ6yZ4UDU0ai3HtghNjtrUaVDC+dCrSCASLB02\n", - "bO819PX27qwUTWW1MCrVhUzQkUkht4Xa4bdnUW7zTudPa++EPxUMVY36vPDJoCGilCgIXzTOV6S9\n", - 
"OVTh4+OA6S/XkcoA6ZjbQLERX5kZSQMoFJs4bPot93titzpDSKAhc1QMx6eKK6Ol2IEAAAEkQZ6G\n", - "RRUsO/8AZUEFdKFRxHYcrgnLV1IJewAc5dAL6/Pr5YWcZb4ejev9b/lpY1ea5Xk1AlTe44c3rPkF\n", - "DXI6yAdEC7kxPh5StAse03AARSF2nro+Dr5bfPJyYF/ERJ9NScPmUIVihvTCsyh5qmuoAH9P7eCu\n", - "Y8rdH1hF/pTSa+Z1tzZc8gwGtgV/YsMtlWLs3VbLWxt2KTDW5Y2b0HA6zgNn25rXu72r6iiN5aw7\n", - "sjFipq/8rjgHE9K0EK2Opn+0SPK2Rbo28aoNdC9V8VxW1CpMNxKjFOs8YmQmJE6Qtkw+Uo5mh3ic\n", - "7Ng6Xje5wAF7a8Iyr8DMIwvMZnnVp6ilQ1B/LSGEPncviRIHH8w83Grtt0CsL1L2isuyMboY11N9\n", - "lxQPpwAAAUABnqV0Q38Aiz6zZgMl5b2XXQAXQ9yHCqNv7FVD9CxHdTnw5pqRTLAoFiba5ss3lqXG\n", - "QCf4/o32jzmzNKjZDN2ghdo3OS7n/NFKTMs4yX0NTqaEhdnVRvrbcGvcKo0NYMgzE8UNwneueU22\n", - "1vpuKbOkae4P82iS9XSi8TlOPcF8mmD+n9qfVTXzL4r0M/s5xxZempvnxqhz38EgmSM/Zw7kEyiv\n", - "giyuP/YjNhFl3FVcOSLiQTCj+F0nLUE7lia+UkuO/YNBXwUKZKD8Add8BG6ZTC4bD/RSktc7uv8w\n", - "NB82AXgnpuELTB2xZFOLAYJncjo03/3uAK678Cl8cw8fzlbnSpp5eUkHacCUtAY9LPrz/OMf2bA9\n", - "vBE2eUwrxz/W0Sg0tjzkUrpnJSF+xYsA2fgRolT6A0NA++mVN8PJVhaGzQAAAX4BnqdqQ38Aj1eg\n", - "HO2BrhbSJp3bjAA7Lyx/X3Tt1hQ2T/wP93u+Km2fQtCsS47kHT/v6cxSu0EEWzwOVr17m7uMIt8s\n", - "rOS2NL0s+wNbNsQiUhFGWcubxLdtukca9QFTdaQjRXuW15l7gz2QnuVPe/r9SLMinrQ8TAT7c4JB\n", - "GrUpwbYY2wvPKUw4NOIKdjGz2TGxM02Yhqm+YQD7nu+MPeXg/5dBf+XeKfPK+RchTbfnRfx28pUm\n", - "+MUq+ynmpWVmmfO3TbD8gZCbZRUeK4LOH5lP3nvVvkbZlQVhN5vPlxxNouZsDfsmprxmWrHzH3vb\n", - "E+c7VsDA88L9wCH+ZmQGzxFjyOQ8cz4P9rsZSuU8vQS1h6fmk4XXUosrmweEGKJT/Sv5qb0OG8e9\n", - "voRxFaPrroiqkALWSnA5n4zcQMwfY/xXX1aR5rslt9ItB406qJIsbsrkl8pXUe2CwOVm9B72bhd1\n", - "lqsCRNktqyPMF/Ek4JsxscPvDjbSqbQZL+uT8zjgAAAB5EGarEmoQWyZTAgj//61KoAZQB+OVG5p\n", - "SZHABUb2//v8PGtlbWZ+A0oGGFPTAdgmU2TFbsuJ6mwUCouNe8f1I2ythN04JSJ5lx+ik6KpnC91\n", - "1FD3eD5Jit+kJIg5holbnldcijL50GRMV+Tt0L65TPBxqSAUdrQu+eLUTHPpJCL4CV5RJau8pEIv\n", - "uK3a7QA/UMQ/nrDjeZ6jqf1BF3JjbyaeIc5drvnYbR6lQ0gBIzp/QRU9xrHm8FESnIe42aooWDJ9\n", - "bVMccs59QBQd45WisW0MXV7NFtyepgfK7biPJN57MDsWL2A4LYHAXH6f6In3GVsSrYQ2HUKGlxpv\n", - "Yf/Xvk0pBnHsuIEsslXTjxwTTzuRb2YT7QCJp6yHiUVL67n8RfvHMNoHfUzP4rVgPSXcPL8FOP2d\n", - "F8GxovHNOmsOSUyc+t9OZXQFF+4FJNSN23FsgARohBEJ3c1u0ax3ACLYlwfCd3/U1mT29ftZkWMR\n", - "uj01t9v2AGHvgKM29X2Vs/ALzLNDd2OM9z+AC4TlcpgcRujIhnjHf17Je/8RMBqJCZtdfrFmz6AW\n", - "Z/aNIv/p/WX6adpvStFWxoDAnf+Tai9COS20TO4GHDviQkpMo6tbNTk4tiYWsmvBNq5u/aO08r2y\n", - "Bs1eH2kAAAD6QZ7KRRUsO/8AZUj9pUTz7rNMoHjJ4gSsLw2wABNFEVCVBZ8at73oa3C8UmeDMVba\n", - "M3uHP8p2EFDXTkl9EiChbxZZgpuvefKfc50lYhoTJ/7H62X0Z9NX2I7S32WT1XJeJtD32zfVBu3K\n", - "VmE+30x6+W2pKnyMM0ZejDKLq8WyIyi+9rC0QVVyU0N739nDCyt6aqRfMfSdljqTnwOmgDB5pHyK\n", - "U8Nf/BZxnIET5uBVX/VcS4bjmT9sCYYwmAz5vBy8cv5J53FYPh0/wF7kP2myhm8SfTnmNtpTej0y\n", - "JjLbrdGSBUAu+lwbCsr/YdOCYrxvvrklZP4j4s5VlQAAAgYBnul0Q38Aiz6zZf6skuDOogA4jl3V\n", - "YKO0NncAuqtob34dJ/eVmQtCFk2jxP+6gBUwoAJ5d6wKEpypNd+AlIf83kNIAAC8trXyGAv3zzzV\n", - "tAa7kzCHOXS39Rxic+qZEHcHH0Hx0iIZnH1UNeoS6dQYQqolDkQpOXG8nP6tDCpAEYSQsJzo5kch\n", - "Xf9jICMUCBjMQXeVS1i3FdA07mrKCBowVzEdee9WvqvXV7KuMTufiL0hA8BHvtD6VFvEZ6eiqgvN\n", - "8RNM5cYXQ2i+4Lx4R2QlAIN1NNxqM8GvSjSh/rgipqY8DwHJh8p9Jbu0Zs+w86pgxJN8m/cvWxRZ\n", - "yFAtI7sBhDbJnNXx83ll0o93YVJhxi0TxWXPf6PlHZeEyvr6QOF2VVafQjsZUg34P/p6tj3lkAer\n", - "aZouLIrbfbTrpoGdtXuXR2qC418s780GZsUBVTlvppC7dgGYqQzB5daoV61BoiIg6tQyG20Yk/Ib\n", - "TtwSJmeU5Eiu/zRo0bpbU2jgV79WVCB/SVzxsmoD1jJEhzN1FHxsbajOijl9Vp76GofsezNr+37n\n", - "UWWhPPzCk1rCLQgaI34ekcMUWq/vBK2WDe7wKACe/5M5UglN5Ct9Orsd3SfYPc0336usW56marFA\n", - "xW2XgVLc1GludnoFyQrT+oASHSl68jJc1j3I4WTIeU/p+eW8RtUF4AAAAR4BnutqQ38Ai1egJmdK\n", - "YqnGBlYUAF9obzNVJ+s4Wyt0Rq0YuZmzKSClvCu/741bUzMW9+2RqBxHf8xROd9WCD2DFO6m3iiG\n", - 
"ZOgLMC6WQsGlrWDKBATBQkW8M70y/ztO1ZzNQj1ow5FREW75+T8qWeYnaEkP0sDPfhS/8A++EHpT\n", - "ONUZpoNHugOpCj8EFvE/MnQhkWbqDB+V4zYJeD+V1h9PGTTPeM5Ykyq4ZMi+8E5Gka9dd2CFXMaQ\n", - "M99mRo+FOH0+y87A4U4JusoMgrnGwBHn7tNdR1Jgk+wKYqmIwBj2jGPnQFJXhHhE3ZkpIjaeakM2\n", - "8MH5c8xC359KRjK1nfiZHGSkxS98YPps7lGGiAJ2WdM/l0XaVpItX1VPHy/wAAACGUGa8EmoQWyZ\n", - "TAgj//61KoAWNzc2A41R+LAApun++OIZUz7EikV/szjfxvYPLx+f9K2/F/he8DHawkBMdV2wRLxA\n", - "t50GIuRUSWE/39Xo4nAQqkjDTJdufKMgNIx0erMAcY2QA5ejjVo1tlzncJOxCqGpuGwA+5/4IKyu\n", - "bmTzdPecTw0ZdpVPq5j/sb/uUTmyS5oriK2QJUn4uMhurpWU0pM90BFHxmx/55iJQnC/E4AiRjGv\n", - "TSfvy9eol7L6q3/AmWDGKQmta5h6TQecJSS7keMMTmFMkcgh+dQEUTFbphGIZpTz6vxfkWPPyqpQ\n", - "VmS0gectGBeLssajkGiu1ivhXeMUvGnpqjpc6XSD8FJ8sVdfwdsse9JozsVq/t5YFq5+AnEYcopl\n", - "mlIiLVwif6/glDa/FvPVZyUrYuYY9L3TA7eEHe1IcHWSOPxpnafEFBrVGoeZPrbfymiVcHOQ/3CX\n", - "aGrpVwdWrmOHr8jLuajUxWOW37ajHobcyT1hYWMxRTx80fZmsfvsrNw/Nztdx7LidHGE8jPZ4gQZ\n", - "DABlByR/bof6mTmjqkfbsR1PCXy4RDNnn9nCnaSnb8pCApsF6YsDTv0+UmVzx2ZPSdm2LhZIqOim\n", - "mhiXHWt+ZE1dnYkLwTdsgNYEeAUTjY5XG25CAykSMfKGwGWeeOwqKmLAqTmb7mCXXxxpy4+bbELo\n", - "RAxOLFOR7z+Rlt4VIVMH4QAAASRBnw5FFSw7/wBiyP2mEJvZyVx6ACpM7CM8ZBKHKR5j7ndOem+L\n", - "X5lQTliSlHrc19blDxI+BarmPxVVRFr/CorqLGvI+vHNUfF9L5rOth1seL+LchCRD6bYXJMlctoQ\n", - "KBnrSfN8OsFA3rCX0rxhgXIKgdEDuCNRYd4XCiw0AyO8VPwgQ3UKQOwN4T9AdwOVZht3xWSjlGSY\n", - "LTfR+DOcni9vpFUI/V99yTFNeriW/Ezi0Mmb4Xp+UrrTAn+/oqePQryHATZ97i1I4TzdZJ6ol421\n", - "ZZiGDIa6I2z+mz36WJISXYfn5PcaqZon5evy7wkHdXdLSXQuyy6RoW3UMK1kv4eYGMx6MEUBV881\n", - "1DxJ4Az2tfQhJ60iq3lK6xGARpoGTWiGA3pBAAABAwGfLXRDfwCHPtdry+v+2nyY2Sk+gF5YW5HN\n", - "XoAL6QRR4alJgXnPRJGLu1H/XzBsCOVwj2OHZ7/Befz18ioG7PdTUWTo/DFmzXwFwKSHq5MESJ/K\n", - "+czoaBaMU0SilMUvvgF9NaNkzEcYOJjCpUUkl+lvc9iWY7aNcNT0YkO2YuPLl1ZJa6XpXyzgvJfC\n", - "YABMMMlHP4hWdgac8C4JyYJle4OEiXwhanMhhDIkpZpmZqqPP6iXGzuSTb+0ZDMJHqoDGqJmkb8S\n", - "IJuvyZGNE4panvJTPVd9f7g4/aXxMPm3Cn3wfT3mTthI056NzanOEWKjM1qGy4olpTOi0cV3zUKu\n", - "VGl1k7sAAAHXAZ8vakN/AInJcXImIY9AsY+/nZAB2XUf7nMR8KlDfCSlxubwbY5yyAvaK6FdhjtI\n", - "iTEMX/gD5nqi6yBjPV+WgerMVdQiwmsTWCh4ZDRMTEvRNiTK06p6H4BM93iWfwAaKh8Gz9Gaukwy\n", - "InHLEZ0yD1XqM2twrrM9K/zMIWUOeN0Z6Qpdges4mCaPjYBUMA0KTxEuHmES85gUYlt0s0Ks9Nu+\n", - "2hfyb2t0rmyvRs70WgBBgYrdeTZMCwmoCbRHPK4oxsSlCang/p1gu/DmbjnwYRln/v7ufz7R3gdP\n", - "Fr7XrHKEZc+f98DBxQMF82PBbmDGtLAQXHwptz6g5mqHfaJhvvgj78jkqTGrQ4WXMBaKzHGNvGYe\n", - "XIR0bHtcMMQd0uz0UHs+NS8bhlZ93PGBn0DI4S7X4qFOiND2PCIg5ogjbfFqU4Kuh5oLH4L3vi2E\n", - "bzWP7DaofhwjMqjCqAvZAgznNJDsvnJzQxJ6Pqjj2ny04t1drdQRUisSLN+PcLenLQZbe401Xg2H\n", - "yhW845ouHrITGSqb9EOEeoN97gj42PjsdYRMVLRDVvCV2BOAqdLbEmICPHZnyy75qPsejK7duPuc\n", - "fJ9rEnjynB/HxYz7zf/RM6xyYbzIoc3AAAACEkGbNEmoQWyZTAgj//61KoAbj1lLPyvb6PAZgAh9\n", - "7f/9/gX2SHKs8Uq31kdycpXc3bf6XPCYn1E4Nyshm7SbxYTXwR3t77AgzFtBuE6fBgZeY48yXmAW\n", - "rqOr3iMlgArjVOjemrjz47grY/T9rKmhvhaqPi8pvZTzkzZCl+tV6nzXVbBFw15yZW9xk2z611V7\n", - "GITjv5GH4Oi/06B5IbjEMVKEcRpvt893HwIyUBXniM9I90uh0TBxOedvsxxE2iLZsr/m/GNXryb+\n", - "9as6btju6GU5FfXHAHKy97PxI2Rac5Rx/FoPiuKEecRx7EQrDfRmlggPPP63oMY4jkBeTzC7Drwp\n", - "8ik2Z4rhoAMWlcRPfXCI56oe4Jt09oRInuaD3ww9/jGDjhHIXGbNYM/s5UG1XuYLCqaLxESIyPG/\n", - "eNnETthXX/QZDvDCFX3YINANkqDvHlUQ+vcUvksaWF/g1aVcMu45c8BoP1coWBAVWVE6iyDMwfYl\n", - "RYTcnNfp26mpOfqiSJnYH+AFj0qGJttgeZBuJCzdV4F5EDreo0WWAiq/0jdXljJ+ZxDij/UazQOM\n", - "0ct15Q7rTOqLKy+lpOVa/koSWj06e8eyy0wY1FBSVaROGYbDgXze1QzYiVyP6+WTk1fjz+Do+J+/\n", - "TxVlHJsfUOz0tbPJ3R4cSjRVigTxPg9VAYynpzzMlIr0/pCOGd4XYyl3SGTwAAABOUGfUkUVLDv/\n", - "AGU2ltMhgssRVFnYDYHdfwUIOpARUIP1pWfDHpU2pf97OTOpyP7SrW+j72yMHgCy10/KQJvVenOE\n", - 
"eMrSHUfyq6lVIsdEDgl0M+/NXx5VMpg+IZB+I7xozsY2f0ARjiAjA8ZSqG32YEqaGwpGp+vfKL3P\n", - "hav1CfnyaUmopPCa0Y5ww/PZN4YINPOwE+Gg36kaKP/ME/B0d8v00CzvLXmI8pIa3TqrGIa7PF4X\n", - "8miGO6oXkRH45ag0gFdgkGj+BD1PvtIptIkuqTa5jzG/NewDN9cCfws/hjc474K6NoCTyr++7Tth\n", - "LSIM60DcVje0csuhEMwOmCNob99l/AJp/9hMVsVsEaxUNsWBZFMKnZoLJU/ljkNlTtF1zcUwJoZD\n", - "oLTT6FmWVzlFnyfjiJdVIqMAAYsAAAIPAZ9xdEN/AI8+s1VkrBucudR5tN1L4cUDsugAOgW+6weD\n", - "VD4WeLhja/JOA5FtORnuW7CfHWfWrXcPJlwit0rQdaNL8wYmpMOBxVMKErdopYTnWfb0EZST9ZFP\n", - "kGeAI5wBNyE7pmk7U/hz6/Uncd5yONsvInzdtLdlFGIUuwPsZsiC4nxcPKJ4ER73zqMcPC62dMwB\n", - "YeP2JTSzcWxmsY8AuUeSUMff3wugzCWo2dZWIqj8MEevc9dnI6e4RX4rfqOmeKfJ7QFxuPllAOzz\n", - "FkyERujhdmr2mdRExctZgI01tg+iF/NwBCqP+hQ0BZaq12BgDPwBcWyuj8PXGo/75aroqbic3atK\n", - "78lcQoP6TccBH3q4TpJbdFKZCXZFrS7Hh71ZQxzuADlZ8DDRzGHyvFJs8+7LX0Z3SVEeli/7hzNR\n", - "3en2BovQV52x/rwTox00ojUHS89/I6QK5rr9xZ5z1Evdog7ewBETCofR8FQPxE+2X576ofb9SYpa\n", - "RU+FFWJ4WPQBj/u1ljXdmoINHOgs90YcpGG37DHSgRaxKh3h9samVWdsr/7ZPH7Krx9nfE8zJoXc\n", - "5Frf0sUOO22BhUTf6MatKarbA54SuNAmIi3ejRZKQJ4XCjhpsLBrmw33yy9Nk6OT0LCi0ELysL29\n", - "OvbOK/J+/iRz4bP6v+/3ppYXG9MzSEeggmS96wm6yOsevJy9wrAAAAHWAZ9zakN/AIdXwVSZADwX\n", - "ZeAC6HD/yFRsSkP+ZT/GPlFXimE8PIk5/ho1VfL2NNL2pqViOd6YYnwc7ksNMs5IkNYQ+fdC2XMm\n", - "GpZcBQdS+anJcAkZpOHFxqdIo1pLhI3h3bcsWXXBd+BTXZhbA2JSmhm8EWBGqSBNaO0U3Qcdcea5\n", - "428f3xthr08dSK0oFN+HNErgBuKfL3JZNShDHaW66u0MaG1B/cF2Go8z1F6LGKUAmsy0D/C2CM25\n", - "q38c827dgYTnZjZnTFxlPuxm+JuWvYpOeWyy3J/wjV/USVL+4BKz61/Ccy+EH/JkQUqRmUOtvYei\n", - "XxTdexyug9nI6kyTGc2H3hy0C3uFxKKFKo9PfiwDCQWhQ1+vZIsII4FYexn+pQbkz5kmdlWKB5Lx\n", - "ONpNVggWvIuTYEFI34NTLTOf285YYkebB68ywIJ5f1uX/OXMZ5RxH3gjNZ8mKLNX9suvs06qOt/Q\n", - "e2ZfZ7Orgt/l3O7GLxwWvzugIsO88I1KhpZhgYDdYZ//1lVBcwG/tKVYjF1obqjtyFctY9LPGIag\n", - "318ehZmIvkhW9djj90e+pnWknudbQDv3Os17s3l7qFADdqSGqYyGaSU47a6O12HCRSwmepV1bewA\n", - "AAIrQZt4SahBbJlMCCH//qpVAC8LE+AX+ndLRI9AAL65x3/f4eNbK2tvWi3seP5qm31GHdf4edmk\n", - "0/ZKv9BuxjUGH/qoYxXDUlaWZFHb65x0lomfbckqRBtklU+1LGTmYtvnPAbKnUSAh/jTBATZpFND\n", - "l6V6ofQ5PTBcFjOWwgI6YqalXUkmqnN6g77O4xvodhM7XQWhsA44ADmvatn61wvReF9d9MqoCN9N\n", - "Twpkx2kbbrSoHJrSyqidCsv+e2gnLoWDEdLGn/42++dseweQBj40iKRQ7paDrpDRwTZVjGQJ+52c\n", - "gaUSUp5A/cAn4FgESmp/sZ0NpfD9/7ZAmCbSUfPUar6ndxZ3XG2DXWcNFu473rzFQZNpJnXg/Pfh\n", - "QCQDuu/iX2Vi2NjGs1QVI3BReUxvD8Z/YeLy6w0jDh9dcJGJdKoNjb9Epdy5r0lFeFb9L8AWhdEd\n", - "sGreMPdTiMRlq+JOqjdogseyQTcuDo5iesxIsb0dhY+P9VqSJtTxyPO42dn6TXPZDgt1vROlp+Ic\n", - "VTutbib7FY5U+jSckVQsLzLRwDuIoa+HpEcHjzuwHMaHrKVljgiPeRI3Afdpqx3nHgy0MFCOhGEr\n", - "Jkw+Dadh5qrWjCGOX2K5HPLV0E5qw7krTDhpWX8sTsYsIqvxr/V2EjIFiKwnheBvunmhlbHNUKTl\n", - "ykWRC9Afa8QE+vO8sLJHYNqVh5kOrsn0+NP1Mm4JPbYiahSDJa4o8TJzkXFBAAABAkGflkUVLDv/\n", - "AGBJAvfAgTZO/kHo4lc9yaSVZkgaxkXEQAgySaAqoJy8U1XmJXFaLzsHv4KqZnckX0gP1AYFUr5X\n", - "3Zof5zltHp7OQG87KhkyMuJLOz4diYjf3ctsH2KA3/S29L1hP4qjZ9kfgNEsjrH/nSlX3ikiiFcQ\n", - "/2mu5vwlzQMTIUj5/0pAslvbULpI2rwxcgfjtpeW3qe/Q0sCZXyJ3L7VhEaeyKZo/ALUAi114xdn\n", - "Gao6fyKpZhWohGCsI53i8XO3Y7Dq+aD4ONx4A265BL770fTZiNNw+oM7dwTK1vcPMdOTVjz4fi6j\n", - "bCMBPzMCGM7CsAz7OQTIKiUTlOi8YAAAAakBn7V0Q38AeTG7snd+wR+ioRwfka+slSBm7w4HiigA\n", - "mYoe7RzT8waKJhe/5/xyHdk2lI4Qb6yur2vWdYx/k/gVzZWx+dAAALHLM2W5kE06MD+/WY8W9vMg\n", - "jgsWx+NCob+sUo3r0m3kC7Z6vE5pa/kp8NVK1XizBU/gSaY6/S/NP+nzZeAUHhvnb6LPnQnTmhI7\n", - "+CLAa1UiK6P+lwPbKP0S0Q5RWiopmhls/AKTmwxXB+WRWyrrFglLMCCi/H7yBlZCPn3f1nUi1WXW\n", - "txmtCNftDVTPLfu3fbw+YSszpG0LQoe/d+Hn14JtNEXcVveVKgdRtrJ2SZSzkDZoD5uTokEopKbG\n", - "geSmsxJSe6mDenK/tstnSjFiozTKWgyJb1mTK9iBWStV+uPeceDypkgatRgkwgz17Zgn457UL8xo\n", - 
"RIb3Rzvhn1PaM6KKHv4wQMqvpqRXKRm+SScKgBhgUzc706tHx+sk3QXrFbfmTj3VwEqpASdMV8SQ\n", - "Rc7Pl7VdiwexHM38nPcgZguGyvH4NF1CZay1mT9d+wee9MfU3VHZJgMp057sUGFJIJZNmQAAASYB\n", - "n7dqQ38Ah1fDGltbSoFNBABy4LNfpqaOuQiA03rsvInHR01iNZMDGQE2sq9jRvjWYcCsjv8TgHDx\n", - "TelM9UgK8aIkbW5xZBO7YH31DMzHB/HcoCKmBUni45/7i/CIo8gF1pGPr0DAA7wV6D09MIgWLTIz\n", - "u2RlgzWHXLOhQSqpesq6gEgghz4eO+szzJWiaji2cgnbFYV7gS1iXMpBIisJc8i3U9gywhFgtGxt\n", - "IPW/7TiYEwGOLwxyjZX1HkROuSI8lAAdZBpungwbYVpPKSngzu3PnOIcBqes7c29MHD8jRPn7Zrt\n", - "720E/jZ4jB2yT62h5AEs+TCYeJmiY6lwGwXm58hIVqeMFafCwAYhd3vDCtfE6mymrvYwtLYQ0YeE\n", - "Ebj2MbA5+zEAAAFwQZu6SahBbJlMFEwR//61KoAWx89GABUe1i4OfaowcQHQyqHCv9PnwkHOB5jh\n", - "ZaY1nqaJvfgMHLxnx0HRU319XsFiIgZ3fycxZ7MoTbod+V6rFy2y2Qtld8RvCt0Ug4PVQuLFLU9x\n", - "N6gbeWntqj92UVkXYHO8rtnoyHbc5vkyDRwK85+1rEknOmV2fCPAJQWJQHZKzqn/akJ6R91HlWya\n", - "u/8GgP8q7KTtX0XyZMALsB3jT/UhmW5AlGIwNHeW1rtDiMG/Xy+69i+m2kTOjww4y5o0/8WfwLLR\n", - "RKlhEE1LYjJQjoy3+hNy7YguxzdtR0GOg0UsPQLFZIBnnCwGmFharg9MSkzKoZck80tBnNzVcu5F\n", - "Ot8W+bdDLv2E/9UTXci1RXlM26z5jearPa/9d/CciU6kElsImbzJ5J2YpzVs+pvW89XbvAJMExZq\n", - "wXD26iUkefzti1p2cc2CbM5qN5CGCTCmR13du1Y9J/JQwXkxhEAAAAFiAZ/ZakN/AHwUpp6Dymc0\n", - "2L536BR5shJlFypABdlGcrzfdaw/6f5GB/atQKmEnLjISTsAvG6zfbdBMs7bm2yeFrIQxXuK81kC\n", - "9pAAAXcBlvswH72knWeKBsU0Ht1g5h3YcKtQv4e82ah693wXobc+mdHgPA3TBKIFWUv/iM+/E90G\n", - "S/NmTeZC+lgt/zT/+HMt/QSFK9C1+AMdH9l6Wmy5eJzA8pumBNuqAArwclv8LW1AC9Ryj7J7dIqZ\n", - "2nhKIYQ08cavMFAGExrDHt7RiTs4Auer+jpijDT1MWhCFcQjNZn9nbOp1MdYUZ3batlHR94YKH39\n", - "SB9iaEe1H+vDrSDRsP3b0PfVLevCUtQQ7tTMju5YxLigI0SkXHby6oMGwH35DOmYdZ/QEHihEbbH\n", - "ljlaWypqm6TR7b/zNBCPoaZiHS0IlbTr/gzMbXxGasP7GssB89XtUV2jZihKJYcij8456L2VAAAC\n", - "WkGb3knhClJlMCCH//6qVQAvW48vGhnpxPcAFRvWsRQfCH0ZQNKlkI/Fmy/VFBZqjdqwlFWyRDRU\n", - "ATa/x8nSCThm/LYIboN0iejGj3Uchm8nyLv3P3+HOOnCw7+XGsyycSpaT/SKI8hu4RwjrdDxqaYn\n", - "k6pZ6qjZtX+IZ04XS8X44piBkZKHHklQnddyez3eJG0JjT0fN5b/c72jAD+sOeXlR6iPKkSUzu0o\n", - "3ha2oHN6UEDmISbP1cbB3piI/SHrisHlFNjIuHiEdkqSzG95tlcEE5RmJMFHyIZtmV+VUnHUg//H\n", - "WOVjyT0+oFlaS4c8th8dtoQJgchjo9u+OPpSDxEJgWI6zeeh28ogNTGzlwRqjfRSsrTItvjA1MD/\n", - "oBFhKLk5Gm5LLSkMpDHu9T5I2IaoH3PKDFRJp5FswrHAqK+C6EMiKJRw3UfQ++e71IzTL0xpDNJL\n", - "z6AeitOHT7WHH1q0lcaxtRKIXyzlri2FOeAU+zEh7DbcM3wvbzCPYrbD4ePmP1flYALif0DM+F20\n", - "woqO1ciEp6KvfcdLwkVhOi6HukmunTXGsruYaqjkaLT2QlUIMJVPTAaXGvEAsJSG/0vfsDXKkk6Z\n", - "sB3ElNrSO3yHej1aIEgW5xnCNisEQsWn6TKnOYGilPN4ZN8EB64V0F8PWNB9Aq0baX+T8kKesmFw\n", - "2y/668NRP8ypn4s+0TEew3V5nLH+An+XxWolypflMoVnWhEhG2W+IIgxfWfPuSgDmqBKtSemnfnO\n", - "mj2z1HJ4yEmqNoBjJwYnWfK8e0PHHb381Mk1zGGJOgWAAAABUEGf/EU0TDv/AFlVerlP4Rak+BQA\n", - "rfH1MAekqKZtO9rI3YpPu0XbIusXd4D2mikBBjNWCs5ZCx1/nIkAW78LpHSyCScRX686DgqeELvg\n", - "+6gjEvz9oPv/Q5SyPMBeMNrb/QJ3ato+Qw19nLJWjl0bduh+HilMsrklIYKHCWBaC/dNC4s7Xl/r\n", - "RCzM7ZJuRKmUY/D5sEAdr/H6TIVmiD0u2jiehC8y8Gw6flB5fdlWyz5ArpMes88RS9cHH1n4Dp5A\n", - "9YiKoxa6XsjMVtwy/Q1CE1CcjEE8nX1x2wi3FF+AiuFwqQsSRlHtfUsVksDBdXLvE8zjbyOIuIMV\n", - "pnJU22cEHHqRAVAAAQz/a8I3JUwtCYefKDlHQuITIdlhxtkj1S9/MOKY0At1R1tnioLMWN7HUVCo\n", - "b6XS9uoGwS6oOJgKcTFbR1vNa4wchWq0XCPds0DBwQAAAPYBnht0Q38AeTSjvudgsbkOLNHOwJSE\n", - "7MIAOT4Tae/DlzyAOhFcKHSt+XmND2K3krM1WAe1ksxoXOx8R5ib25iI4yoXHAvjcPvcDoLvQIYy\n", - "rfzkEj8FCsgVqTty2M7mcrrsvBMmGI/tSEAq1Wpq/wSUg2I4oZj0GjiChzewD+uw3YnWAi/Ntf5Y\n", - "Cv2dU9qEo9e3jPCavhxnj6HVQyqcvxekJ6cEcAGQvRh8PwiQyys4LYMz+Th6jmnZO6zDQlY1h459\n", - "aXiX/1NPDVjhvbOibPxdXy1nW8ZFN/ZpmMtUtTAz4mvuGfLCJYTZv8r0n1cztBPRieehovEAAAGy\n", - "AZ4dakN/AHwTrqiSAEDVZr7cfUIfCi6SEtf6z4BBmn/qEvCbGFYoG0hJzipIIEfgPxGLOPb5hgYo\n", - 
"3EqlxYfhyi3ADlPB0rSvUe/2K1c1bOHHkBdbN7v2fRCe6cTgBUViIyBzKbW8+YVzs1NjLsftvDLF\n", - "Jws+AVbFUOsz2XZO6+tJqS4okplORVfI8Zh8pjE7ly6+HI7Omo301kEp6VZks8VHiVKJOuTRsuFe\n", - "1lak9cDIgZS7IV3MkEjdmu8V6wPVTOui5KhgRegdKpe7dvKwiZROacSHUyEpgoiQ49NAkgd9ICSC\n", - "nOG96XtcVUK5qLGXI1ECEXtJcuaFVMtCmmOBBiFL8jC1MpHbxQ+4k2qRSUjP3JvFi0NfrsxeXbrH\n", - "Ebg5vBmNpJE6T+wdC73c70xC+Mtp+wYFzu5kfTKcL8d+Nzu4GlIr338e6SWwNSpXRGjfdLp9o3Ic\n", - "2PzMtQmrlpbEeUDp1vnkaZoqSF5M9xanIk/zohgoPX5++NN/ebYvr56WROjUeIUdsOf6nrJlmboT\n", - "DZEat6r4aY15lVCgiz4Mpb/mqSazxzrszmdRYRxGsW8DnzAAAAHfQZoCSahBaJlMCHf//qmWALFy\n", - "5oM61QiAB+cxK4+jNCOHXw6RALujtnWF0llKsvjvaSIz+44BdTBn8Dqmduydu0Ab2yYLL8rBa9BR\n", - "bM/WBrO6FCt4pfpaT57HiAbORTevnWHgnUCdwsiqbddvhjkiuJYbgCMD0kEP1SURu/b2Z5hWsq5s\n", - "eIdJwlVUmffx/GFsHH2OVg2kldaudIzyWEsMXsnZccvZ4+1TTMECSDKdUtlhUW9AAgPUraaePKP1\n", - "hatMAsKbsEP5g1nzjTlmyHjs7FjRbwjKng4/qsqVQ+s9Z8Le9mq44VPerxrlkKxdRgf8PQXTEpxP\n", - "gMR8UP9I/vRSJBbzTafYsMhPytfC8ESUe9ySga0pNZKSvC+bN1h7zO9OEjqF3rsnXJU2SZN7NAbS\n", - "01WCPkWQIdWN39TZ8BwhuM2E1/XfXA9OxCI/7PAG40Z8M1rKVJPTY+iwZnIQA6cEF3rnJVasn/JZ\n", - "rircnzzi1JQr5NiwthCEkD02k7GAoyHtF8lIKArvw+GqH7Ox1Tpd6DhPPJm2hmyijeFH6E+9UCJk\n", - "Iiolc9K3UW1rmUlHlF/p9jHAvsiiJUpuG/KCfna2LEYj9yn6P2oNlWfqq5P2HNtctaJeVRZv9Qb/\n", - "mNVjyjAAAAErQZ4gRREsO/8AZUEtk8LzOoS4AAhIFC88oI10PfUAs3UxxCOOtSzHREgn4/jgVfHt\n", - "0r483Tf2Y8D+zGlycQw2lUV6Nidlo0k0sASUCm4dEwF8Hb0+IzseFE0dYexJdLqvhcI7IIUIH6RG\n", - "uv8cjTXFD8CTksvYGpGc+uBYXhlwc3/jHhNGtm8G24uHniey+Zy/NtEpSl5dub3bE324kx+/N1gF\n", - "sU/CxkQF6UQWvd6Br4nL+i2L6udCLqM/JAVJhScc01UR/bE+NX2i3upx0qofgxfWL8unNZ/BP9Vc\n", - "CvVXAtxPw+0JopAnWMlwtBFG9wd+oP4zOIJ88u/VEvyZQd0JJP1Y3qhYk13Deyiv0C1r6ci1z7CQ\n", - "UwYqgUT64pT/hlIvHeCzEZxqH+WbUbEAAAGYAZ5fdEN/AIteE+hbrZmAAHNd3/IVGxTYP4E6C+Wr\n", - "63le3xAHjzqOqEil1tIAAUY3LvF62/277H30QskV8sEjceHvPe7bE0mfZ44avBY2gS0AAAMByRDk\n", - "EKOyh31Y2H0mdsy+zcGsPrGm3pHtO2riBcgILxHO0F5398HG90hK8UgtDUfp9CQyPOvDSyEU4WTb\n", - "6/WT9Z3aca6tb4C53W6p8Geyjq/mwbvNpnCVbbqIcx1ZT2+dencovmeYmPlI7jrhk6KwLYEd+5gO\n", - "J2YeKk4iWai6BsaO9+Tb5P52jBVHcSZ+Vws5QhTxkBSpdHlWJRcbh50V4ViVltwUN//XNx+jx2bk\n", - "KsfglI41FGmS2xAJtr8ZhKDk1VRRL2tGsNB5nztuRXCFd8q4MIuVVWGjim0ntcxZ/R18mzJZN+sI\n", - "qKUvfsxoaeZp+oIaU1hLeXzgcHEe+3/6emdZeJWoDNhUqhkfWzWzVZbEzUKpDBS9AbVIA5KR27LD\n", - "3HEfRMw9yt8eYILg7m/Rm2ubtU8u6V2QuxVXq1OHry5oY2TAAAABvQGeQWpDfwCPV5unds/RGF4o\n", - "aWlq+XwTSVpG+igacFOApaqyNJIXSXT4q7gA4DkP0YAYAumNCN0MwD7HSEeIsv3Q3L9kZ2RagxvU\n", - "jle4yQq6Zl5W7AgdlZnaBngH/w8xYsqWx5t90zzi7s9VyRY9jaNshfxuJAZcRgFILNTmQNCPoCtl\n", - "wyo5Ht91VCy2qSby6JDLeTD096PzM4KOK7/I+amuefuT0S/QnDNs952oi11JV2mbadqtKDqJE9x4\n", - "nX/OjU9PBP1uhsFLNkjsz6ZHlTOcsZvWUxabbw0HBNFuLXWIYqtAYdWN7c/QUoqY2IlVBR//v+NN\n", - "Bxf/rxPv+9QlTTeUOAVhzyU/kQACorW+VEL2KFNUPF85LUxlbSGEYQv/98/fAQAu6hKRw3yoJoPy\n", - "tyr7S7Za9gGurMYseuvuasNoB+fPCmp37VWgm4yNZQ0LM+8CPtaQgShVMs2/RIG2cXksHuYVqEB7\n", - "PJtzP2tl8EYDen8RohIb2UO5d/Xdc8aoi/Nu4IzGq8ApuZIxjC5J9bUYtMDEDA6eChGKPjb20vqg\n", - "2PRBI2fSXJrcSROGTC4m+VsF+VagO1LnjrakndEAAAHtQZpDSahBbJlMCG///qeEAVH55ayIAL6z\n", - "9D9Go2JR/VsPgULYIy+HM1JNQWUio64eqKV59gHDbxQ77xKGvVi/RlMeepNHF+Cplpp4rKqgivaK\n", - "14o0jVVjKwdzXmYfm8QJck76NrSj9rXzMi3Th9DbQ5HQHvlFr1+Ft6fGVXaubVoF+Bx3J4nvsWO+\n", - "FhXDphKaWh9geM/3PqX1TK4zqhRL2wKgDCWdLvIi2s2e48RSWR1zksj0SjkMINJfgjA7wVj0dW8Z\n", - "NZGlcRPjgkoSgpomI+x9/l7dJ5fHEj4WOkMQMTJnj+KOqaXfgtXbhBachZ0Av1Z6rh+qw/iObJOy\n", - "7q2gUdlftEWI7In7KZjqqg18Bg+z35wI2FmknOyXdEiDAPaFiRrhqkKOLfgLssw1BdohiuTGWlKn\n", - "NvPL4EzIbAUeS+0qv5cFdXvRjnn1zOMYTMpyN1CZYg4pqjj8mGtGdm1F7w0Xo4Mnm3hRmvZyyOaW\n", - 
"yf38s1SCwyOkhQcwJhrAAebvkxMWrAUWrTq9K9PdCUqFbMVB9+93aovoux8zBfM/WLangtLLXd/D\n", - "T9TcgY0eosWGZeAhQk2sxNC3bgvMT328AT2T2XCg2nG4jsOakPWfscwbc0zKfItj/1eXvyR2tk+K\n", - "fpgdg9dJ/OdcXINTUAAAB95tb292AAAAbG12aGQAAAAAAAAAAAAAAAAAAAPoAAAnEAABAAABAAAA\n", - "AAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAA\n", - "AAAAAAAAAAAAAAAAAAACAAAHCHRyYWsAAABcdGtoZAAAAAMAAAAAAAAAAAAAAAEAAAAAAAAnEAAA\n", - "AAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAEAAAAABsAAAASAA\n", - "AAAAACRlZHRzAAAAHGVsc3QAAAAAAAAAAQAAJxAAAAgAAAEAAAAABoBtZGlhAAAAIG1kaGQAAAAA\n", - "AAAAAAAAAAAAACgAAAGQAFXEAAAAAAAtaGRscgAAAAAAAAAAdmlkZQAAAAAAAAAAAAAAAFZpZGVv\n", - "SGFuZGxlcgAAAAYrbWluZgAAABR2bWhkAAAAAQAAAAAAAAAAAAAAJGRpbmYAAAAcZHJlZgAAAAAA\n", - "AAABAAAADHVybCAAAAABAAAF63N0YmwAAACzc3RzZAAAAAAAAAABAAAAo2F2YzEAAAAAAAAAAQAA\n", - "AAAAAAAAAAAAAAAAAAABsAEgAEgAAABIAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n", - "AAAAAAAAAAAY//8AAAAxYXZjQwFkABX/4QAYZ2QAFazZQbCWhAAAAwAEAAADAFA8WLZYAQAGaOvj\n", - "yyLAAAAAHHV1aWRraEDyXyRPxbo5pRvPAyPzAAAAAAAAABhzdHRzAAAAAAAAAAEAAABkAAAEAAAA\n", - "ABRzdHNzAAAAAAAAAAEAAAABAAADMGN0dHMAAAAAAAAAZAAAAAEAAAgAAAAAAQAAFAAAAAABAAAI\n", - "AAAAAAEAAAAAAAAAAQAABAAAAAABAAAUAAAAAAEAAAgAAAAAAQAAAAAAAAABAAAEAAAAAAEAABQA\n", - "AAAAAQAACAAAAAABAAAAAAAAAAEAAAQAAAAAAQAAFAAAAAABAAAIAAAAAAEAAAAAAAAAAQAABAAA\n", - "AAABAAAUAAAAAAEAAAgAAAAAAQAAAAAAAAABAAAEAAAAAAEAABQAAAAAAQAACAAAAAABAAAAAAAA\n", - "AAEAAAQAAAAAAQAAFAAAAAABAAAIAAAAAAEAAAAAAAAAAQAABAAAAAABAAAMAAAAAAEAAAQAAAAA\n", + "cG1pbj0wIHFwbWF4PTY5IHFwc3RlcD00IGlwX3JhdGlvPTEuNDAgYXE9MToxLjAwAIAAAAQZZYiE\n", + "ABH//veIHzLLafk613IR560urR9Q7kZxXqS9/iAAAAMAFpyZQ/thx05aw0AAQoAAjZrf0Z7SQAFS\n", + "RBmrGveunhOj4JFso/zYXaRjQ18w/5BhxFIRpIkBeRXl9T8OOtGMbM52JtIMXIY7KRr49/IsKi0w\n", + "jJUK8Z7XIFmlAjIU+jSbWER5LmeK+6/diSLijDB3co/ebDgChTdnt/smJJAlFMJhzTUcdwoA8NQo\n", + "YBnpXwCtHd9MDNyz4x4zrqfgfXAXtVDOuKqK+ZIROmkudESU5HAc84NxG9mIFkHTHpfRFX0vfuvN\n", + "v30XneTe8IilYhOJYkyOcVBz9L5D3N5P2RHbPf8d2Ia4qkwGurGLJl8PxjFsKE4dm+f6WYtxh4/M\n", + "EbibuuIVHuFVTrhDBdjGsnlvGJ613cHSu4frv4bqhIfOz9nOKI/zhLw9zlvfAkAek0G+jTz8be7+\n", + "o/ndntGdno6L1LXJpdgGJYFOyZwDpk3suJqu9FKdCFsjDfQ4s5OYpZkBRm/h6ksvqs/jKOI7H7Eu\n", + "JEDtMn0Px1875SS+KLSHaHwtTCNzTTTEE83rjSnRcLH2qekoCAzC/F7u+tWoo8/5q7AU8ZwbFyde\n", + "C0AcLGLOTLX2dctD5sMzDYlYtX/lYiEND4SUALBVfbetB5IH67pM/22hp7cM4zkyUfekvXZeKUpq\n", + "ihxpjZ/b0GfRGel+eaIkRAMer8l0HHBl4xOpdwEUiGEQqacmsmAKA7/Wn0I4FZAkAeHbrP6JQw8G\n", + "T6oLn8jHc2YBwe6YY+t5SuugRFwnijdFTQ2IYMHZ9spzZjJhn/lftFm13UY9ay8CDty2j8dXZfss\n", + "pdN3RSB6EMFrirN6yUkoxa8UPGBKHs9MUFO5MnKDgADHT4JhBGInxUASlDV0lsFB0GH9ED4tkRc6\n", + "7SnaMmZwf9T2i4a1NSsheM+jHEQWr9fgPDBABuIyToLYLrnVeLXqSC8JMeZigh4GOpQKyiIsG8oa\n", + "f6kiBTwG/5RebTqU6O7rrQLj5Wd5YFdqaacUZGByo8AxJ60NHIoQcxeNjsWAj6m8SKd2+g3en70+\n", + "zVQW9HkvHI7nnRF3FhwhZYu/LvproEPyWSYykJIx75ojR14WE7oWSjYs0X2AFiwEouayVGii6owJ\n", + "gdlCmnN8HoqT5PPnaOWG7mPgq/3meUuz982ZX4+4VMage3Fe0K3cqRdKLTge+gs4pyQbSUIdrgo3\n", + "4P4R1ejF0wAW1R8YjLZz6fQUzzzchgNN0t7aa8tlO2yDCmII5BbaYJXJrRvBm8Lb1m7TLILNalgu\n", + "RMjYD4Pf/P4iQqWsBEdgB3p334RMzrBfcviq+49N2SRQlYxV0SbSMdybZaH+vxuw+VyvLt3ulEcF\n", + "rmBwnxL4kpGATPv8mogAAAMAUMEAAAI7QZokbEEf/rUqgAYz+kaAoYS6oZnCZBWChU49QzRvBVh/\n", + "3Pl1tY/3h6ui3wW2qKCfpdwQ1h/uuKhRazpong7+Xsbw5g3mv3E7I0N68sUiey8Dbt0hMUrR6zYj\n", + "YtzMQ7gEdgcbbOEgu3H73w44JvEzvgZ4iO4Q2Kwp7BHY2uxxtdUENoG1kHXqnnQawFSCHZ9W6pRZ\n", + "ZX580jW/ekv7tzX5SLrr2mknIiIEL/9OqO/hdKRWyIS92L0VbeMgboQPIpdXZEemH8ScfWR641oo\n", + "Kb2ZqixayrynX4qeQdDAXvtKdnTPfgTsOJHs6zrnaaKb6SpoCg9ffzFUfiQ1YwLPZpLhwkJ1F58m\n", + 
"QtliSU1LCArOxcL0CdX1xv0PO1XbIga8mvD2ON78HrYIlpd7r9MIJUgGiGlRxLTUITjvxtxjLYBG\n", + "TBzSQ2Mqy08Y4xvBh9/AZrWGoBvplKVOooBAXsS/J3OngcAaMApnGniTlEgacIB/4ihqQm9Zync1\n", + "WrLEldONGr9K6gbteZcFnK/hoe6B53agN6YwjF+Hm1IYltzK42eiNQbmeo0nT6xx724Sek57Pcpp\n", + "/+64lZEYNhMLw61j8cLCmWJLqJ9+OlV3Tu4kvqWM5A7mBmXunK5EElFvFoiaHvfKnFzVKUZHVN47\n", + "dwwOu2bQK/GEFcs57H1A4Ddl2JAlJt4ZWgrJx+vzAgyhhcl1LtQgQcd3rX3aPisDf1CYETnay05i\n", + "xe8yUL0AVMzI07+lqERP6auGU//nlrslfAAAAS1BnkJ4h38AGAsZbANezx+IWo4Ni9MoMfKTC08P\n", + "cqaDTyueeuPLGgFgW9U33kZ+Bw1xhP+VnfaIAfTxYvkb1WNMMRMsh5PjwSMCmaFIlQvFeKZwdgkf\n", + "0eHuoCcg/XQXRqCvEyyYU7Kr945fY16Tu/18Zd8NU8RAJRLFspmBVoIPZ/aTPmSIXDq8KOSzL6TG\n", + "sWN+V8RKxGwExIfHZpdEvHu1tOeg+pVzKTracfnYiBxxlkuVIyzOz2mFv1LQ72jZQGocAdWS14tD\n", + "EtCsmNljiTGQDRggnoajq8kpnFHws9ZMWmcsV4dQvczexFmx4YibNvvMPauj3CH/KK6FXvQFumid\n", + "ftiga3Uno6si2epmOuEVTuVQwXsgCmOyejpjAiAjZuUS1zq40WginD1EPNgRAAAAXQGeYXRDfwAh\n", + "r6zZu6OyBrfB5mVsAz3QNRRqvrwAcnFznD7NXanOaWlAADNOwlJX/xGmO79sH9XeNRT/FnLuEPBH\n", + "1GJhJV/Xt2R0YziQPpgXV9BLMr5IaMaU9R2CpgAAAPgBnmNqQ38AHhCAmS1kGlkSnBkADoOXdXaF\n", + "NGZr+Q4fCvQ7bHDsrrZk+gghfDnB3EgAw+hgyCz7QjPCBdm4Oua2VioU2d4nUZ+UABLNnRNNghIa\n", + "znH4EU6++iAxhcURNicOGGgil2sQO5YirsL6J7S/TznXYcILcn91E9qrSkdqAKeiqMttbt/NlBlt\n", + "zFtTLIQV87eeTgQtRSaGjNkYcjtT9zsSroMxdQkaS/rgzWfPKqioru5///iiFvV7FHhGNapsB8Ep\n", + "xA6YqLEIyfxd3iBKiJ3g/96H/WMQrMVl8ykLYh6g9L/mEknpMxDRuX+/d5vuR5TJpN2l4QAAAY9B\n", + "mmdJqEFomUwII//+tSqABipnkgGrJGhoF2xhqIGFJgrTiV28TOHP6iMSZwA4LzauSvgcy42/qpKz\n", + "PF+GKWIn2EJeWsQWOqhnFWAeu8Qy08RHEYzw2BIfhXKPnsvQ1D45gRUsCZjYq85tliORVeVqHlvt\n", + "fzWrMqI5f+favhs74Q/1bo2ebSMVUSFuP3HPqFVDjXrf/wjJSgWTFPNzCZtjDghfnhYgAzPVh4sd\n", + "mfpnfQi7UGcAu+X0SPRW+sCzjBKyZsabYXRLvCvcRgXcWHRJnqJZ7DbIL5Ahmra4MUmiAdrDqxi1\n", + "yixz8Ge2MnwDKePhHbASj9FgVyabApZmODkYAk9x2eNsu3NC/GWuEsOYUEJXb3NkJ3H0Ehpogb5q\n", + "/7IADF2Rk2r94PZTFE6TdqRa+DeKrhf1PoBJxN2bNx2sA7Pci476Sn+ZpPsAPTlXaikJNRAhO4tD\n", + "lakPd29Edmfvk34bCqY6rFMuCfUJ3yzCy+VRKB59CtgS68dVzaJO/FxZ2Of18yjXsScM2fL16/kA\n", + "AADDQZ6FRREsN/8AHa60qBaQmR4IRAA6Dl3Sc6VtGJbtr5vbN23f25BY5Mbt9ZodJaqeGLgSZDt5\n", + "tMt3+exLq/o1or+DyDOaUjfDuI6HO9EMKVIFrK5bBNySwYGQ9ZOLXviohcSZAskgQCT8YbljWqgY\n", + "W5O+m+Ip3OoA9JMxAp4EiGRPR1hmuQDeRomyGX7bvvzp+lmhQcgx50Gtf2FsWph71RE5OIfz3vbU\n", + "YPJzvstNoHMLjQVN28uexbTk/wUswGjCQ8u5AAABFwGepmpDfwAhvaAbJNR/9ddNI1ZNZPr5vm6q\n", + "XTetXH7Eo8GqFltKJbOb+WxFxg1OZ9LY7Pm4G1n+FvJzAc9iMK3kbM6geeeFIdRl75A0UZYsXIff\n", + "dQXiQxB/kP/GUeJS/ghHdsFXhovY2ei0jBYXhl7XCQdiM+OxqVpdBNYdLY+vhvtTydDweWAQhmfY\n", + "3fYN3w2o0+YtvleCAQNIu+tN7OfSeOifT7EOLQk4YDYkvT1QcI6scYDf1en6ihiP1DSq11Clzx8a\n", + "ja6cddGuoMqDaNkxCF1dzf2Jvz1VA4BpWPjukcCUvSBL5Hjn5IenmZHNevhC9Ri5TKMMAK1OUZos\n", + "eUJttkHLI36Z4EqqgVQeXc7fMR78LG9GpQAAATJBmqlJqEFsmUwUTBH//rUqgAcd7WUAG1wL+eMP\n", + "5NbNjI1PanDtCkQqkSzemsYEjSdqyjDQBhMRhcVkBjrLnQ37QRY6anUo9HtaOXKEvV3Oq3t3zJnU\n", + "VnRnO4+DsYDha+hVjf2RQfz8iIHBAMZBzDCidKRjdK++FyTTJT//wjjoyDzrLD81EvvOEfP1hNq1\n", + "E7Mf/LNi4VzZp3xaz5k3oYD4Uh8itElOoUglEcP1/ghF2UcJA9hOtkSUpVhA8+T8Ytc1zpVMfYyg\n", + "QqbyRa4EvI2+PCgNWtypZmPOW/fUb8LPNYTg5GLhzbOmSjYpenEUzkib0QksNLKbj/E9aHrV1qHX\n", + "qXiny+3UUPxYGvj/pDuYRozh1EchMNkv/eHEkrQhTQjnyxDirLtyAwkvICbz8w9UK2AAAAC1AZ7I\n", + "akN/ACK9oCBuM4cceanCEEWpV8cuy27lpLcHp0RFJ/onjSEljOG8VqS2Rkf30kIRre+KMlNGVcvp\n", + "cL4orO6Yp5KjC/RRBwQz/yE8UKLNeO0Y0FFhQfICXcBtO9ndieTXXlspFHuGf4S6CeBKlAO/lDFn\n", + "Bm6rf4RqP1vvLrD8KUBlig+AFH77l/U3BNsHxmcjURJ4rz9SBUp3dWhkBmKNCP57UtC9bKnqFyE+\n", + "YvACZ+sMCAAAAZlBms1J4QpSZTAgj//+tSqAClE1egBKEwbZY3t792fWy96pbeQQCnoXHta8keYB\n", + 
"6YD4iyrisk5RAGXAP8hftXkqsIp3gIADtqeyulunIxMvA+tHyMYI4mH7Ktx24JQCDLGwr+SW5Lfl\n", + "LFzLN5Z5EpfMBtjuN1e5MGJfkKE7RLofReD1fgshPg5Hiu3eNzKNtXPqCUQOQrANHyjLVDHW1On8\n", + "GbpMg//3+EW5h//MyUrV8C3bm65GCPAdr+IiAQS5PLqRpJaqPFXYImLzCfEF4IcxGqfKzcnaOGUe\n", + "P5zhUa+at6SYruNLfSBlr3+mvyhAAxPUBpQBX3a2ZIbz3QLaxiA/KmUnrCDmuWAQmEAoRWFYDkhB\n", + "vSu304LzlIj5BSPPqNvyTdiIsLpzAu+SwxleN8rOU8p84R24aRhgQwchoF64pWQkYvhDlixS1XkC\n", + "+1BFsz/ugThqWNrj6DMWcUAmd8tN3JWA8raGQmJpBH1Zjd5483GFE2+DssYAdvIzFktdYvwqJy33\n", + "xqAAiKb/jZmChnRmwaKmyp+usNPBAAAA+UGe60U0TDv/ABgTM0cFpiU9S5COo+Eq1a5EDpKRq+6p\n", + "lSs4dhBzMdhHGYju3Syu9sir+n5TA4S4EozXRjp4djOH9s6Ebl4mnuRqUkAVVyRRxloLXXdAVwvm\n", + "Kw2kt3nH3KtGiXPZtoKRlLMwsYrakek54VGjJMSSK7z2j4bZfzdU5fWILhtGELYhukSGMv6CXtq0\n", + "ugZLCx24z5CJjXHZ6aJugoOXVvLE5AMKcYDe/LowGji7OLeFgeB849mfSaUGlnh7jxuhBOU+fRS4\n", + "p0ITI4vXzUUR4XVTQrOXBNie8HQwoivm+WRv0nW15Zl5mZ7wAnqm6XldppA1IAAAAMIBnwp0Q38A\n", + "Ir2gIG4zgb64sxYLzhi9P+r7lwy6Wa7RRkAjTYM9mY6ueOaRzgw6T2RlVKQ/Wnw9OUPsoB+98v3K\n", + "7Ai/8Ku9oiX4fIaC4XxFxl+0lQDznNsd4UfPo3AQh6FoBHug176P/7mBbtXW9HioX3mZhTRXJOlh\n", + "Psk7HP1i1klJ4f63KMPuZvFOjkq75Z+u+/aiOQvmn6+lP0r2vSaqs7nxNSGwPqSwNXaUgQz58aD0\n", + "pB2v6eKf+Yy3eGu8f7HHrAAAANkBnwxqQ38AH77opN4Quy1TZxAAOg5d0nOlbRa1oa+CUrbGUKO9\n", + "s1K1K60LxAZlk8ZQWiHU0UUuQDnHAAyjelIcwOj4NipQdTlRBT+HrLVCVEK5smCT4WEyhlST21vf\n", + "pS9QIx6rrJJt1ZwRk3fLMy3lh+GbSU8p/deKiRgvPKu2y5xljT8HokdUfoJBN0b+9AYNdPwZxzfv\n", + "wRj3rjB+XbCQdH7rLOmVBWtc7YBBcmnLfJ50Xx9vsPrIGyT/orCu88gDS7Q97WNMWaRoINuEV0SN\n", + "7lASQ8YC8xeRAAAByEGbEUmoQWiZTAgj//61KoAGg+KazAhO48Rk+mELCfGa3jedcL7j4gDd4k3m\n", + "hfDQA786lCeWa51/s1J2qe/kkvnBjg4L/5tqqnPuWzD5CtqsuCrBZfD9tieYn0V6h2QRjHTgf2S7\n", + "KbBJVduRkgXz0DCyLCsDRdQx7ZVeilFNQPYHPpL3dFbV2ZQLhZ15DCVv0ijUbfdtbaCxQWk4hFwi\n", + "4Cl7Vcv5eumMKNjbBf29eX+p4vfxRMeLxQVGLH+o2FLpf2SZwh6nFX8ReHwFB2aNAZojees14KLO\n", + "dDXVOKLwRfawG/F4iTHLNjIHr9KJ7RMP+ZW2v4UodTEwj2IkfoeugjPYygxsYBEN/HIWo7Lp4BiH\n", + "W+sGNW6nzMrLHeZnfPrIXJzjKMZ2dMe3r2TPoxLKTVgPHlFgXbB9gOVEkvjr1YtxEt3sHivjr7TH\n", + "zrmzrXSS01xk914HSqt/CnYSKPxa2MF69g9I/BNJSHdHCdNGwRVm5U4w/DYDySkJOTHhPK5xLTdI\n", + "6pomON2J7Snu3IFO1cMuZQAgHAwoynkWURtTVoyQbA1o0XW4HcVte0xmLSUrxW27KPhiReLpDIah\n", + "P07+6UwIug2Iw2yxWwAAAP1Bny9FESw7/wAZUxOT3tiejYgyJDRrCYHaMUHhX+buBbaoqZ/1iUWs\n", + "Jb7slI/imiQ6OnWj09SEskbfc/zlMQQ4SNXZauWfHJ95XYh7wMFGgh1p51IG9qMewyJwQS444Zn2\n", + "viLgUg5+yrpXHCf0t8/9jDlbqwjDulbT62pdxpAyxuynsO8RFT3dUKeSE5htp/jbraDowEdpXZyE\n", + "hG0WYkl+RbztI/PQNZCwZsz+nvpxvKr5XHM1hBpXHcYTolc3yg25EknXG5iovx0Y9EuSqthrt+Xw\n", + "mK43mYVJUVC/Oh8GeZYMuS8/kSjScKjb9J2cbfyAxgmK23G/LX345QQtAAAA2AGfTnRDfwAc/TTk\n", + "s3FNYSmNHdPgDfXQC1GBEwJGCqSU6MsmeFhDrrArJ4DXkS7h5Olwl5LsAdAjNSMWnsyuwfwlhiS4\n", + "Iu9nXiMR2gsFQTdJfxAGWv/oGKrfOpY9OM+oH5mmAEYRbo0uYIZjYyyv9H1tg0RX725ktocEeT9I\n", + "3B3Tp4qYCOAxN7JPiw1LGqnL098ntFu5ng1+yPoA7ayjGtnhqUNzDdxHw06qdCQZykRFXaAS2mFv\n", + "lmomA2wH7gnlU4hH+9/QtYxMog0PKOypGE94HJSUfoT7gAAAAEEBn1BqQ38AHE7WHA5VnN1RP/m4\n", + "B17wBGTsyVXKs9N7WlI9AxsJJ7v9zVkMjf6pvv+Cg6JoQ3BLOK7r3bcONYUtZQAAAddBm1VJqEFs\n", + "mUwII//+tSqABlJow5npTNmtYD16z8AGI7v0s/GnfyqOWKggEMwd90EmHsgCWksYKFE4Qru8Yv50\n", + "LqOKJvWMLHGzKIf1mWoops1hD8q4hCLJMEdRItKEcO/AvOw75DCgogAQMHz94YdBlV1FB7/3PGw/\n", + "kvp11c7Zd3bjgbTV5f9wCrj5V98Wrk1QkXKTao3xn1WeAORpyCtFJo3KIIzvry0ktsvXmShsZdHK\n", + "SF2Q6qY6Id0i1QRrrPRdF2iq2m2rhv1eY7FLgTuR+kimJsshiQFr/qQ4tOO2msQRBI4huY4JSA+L\n", + "KftHgweMeBwJfCg9ocoILqar/ZxuCC1Kx59hrQRJPfm8amRIkwU/k+wKJNYh9fLLSBsxlrg4XoMn\n", + "PzXBXS36HS/Vq/PUU0Saj0Ks8oGCHCVcz3eoIxgiU+QJY/DixHlF4+MYR1JrL+dYLi5XU6rOa8uy\n", + 
"cymZbC8fCrT8nFmCuYcD3DNSzmKt2Ypk8ahqcNxMHCCE377w4QcAAK8hLicCDiuo9KVio6ugqDQM\n", + "DiWya9QmBn0ClIbSCznyVdfSZyODo1gjrJ9IiCMcnWI45hcgB0F/w3f4fUDX3TFD/vbMoTmxwMKV\n", + "hWEq4XvI4IEAAAE5QZ9zRRUsO/8AFKVUcHl/E43Gt6o4RZvBs+iAp/X/n7d7Pz7RdmO0J7CPEDVr\n", + "YOGCwg4aa5sRnK1DwPx5sIYzP38566ezpK1+yb8tpnK38Otysb+fPORXq89pSQ+5zLmadq08PRPq\n", + "ft5b+CuHdsaohxgMdfr5HBiNNodd0VK8TNpXmgIXzYR5RpK7ScM1kMS9Nv/EnJHMV/HrvGwgTDTj\n", + "k64XWbP6seQRZKb98opQD+okWzwHsAFj5ehr/ekl0IlB4NOOkEs2vqjJoc0vIcwkba8FSFkLe2wm\n", + "HNG8c/q9E5Tipy3avrHlLTvT0bjPkjeD4HLfC3isImW2RvjzyyF2TiLuxINvE8y7u04RbyNnhNhC\n", + "J15BQDsVja0XtFDfnnr/h18foOkLRpLJ1yQTMBboYsOrVzSZ9GDWwAAAAM0Bn5J0Q38AHQXz6rvN\n", + "uarixND043ZCNdAAIHUCWbOjp5TUpZdEciERk/s2Hj36k/1QHuy5AO7bU6FcTtkwLNXpp4kEhhr2\n", + "pj14tuqcy7uq8XfveV+qzHFw516IWJuk3fnleTKVnyg4EmdGVkh8uUm8KAFIin8/UzurGkP5FXB1\n", + "JS0uIqtx2mbD94hCpeHMsXHXmWbW3GUD6bwQzUCwUdgGFWWOBIzHIH3jzzxIIZ0rnTzx6fd8zSRM\n", + "hMrhmhy9AElVESMBSl9RUVwHxFBAAAABSgGflGpDfwAhvaB1qIOto5yaJpOYSSkbksLCkPuZStd4\n", + "LeT7CV/DcB+jLm/y8AhlFfeod4crFEXxelJR/fWiWC5cEAQJB3xoICKkbqYOm6EmFwfhOJrnHL3F\n", + "i7egoJ4YJywxTcfWExKLj/7q5Qta5s9pQnji3v49xEhquy1bNbsP/0r8degDcM/eCvveCCuWJP4W\n", + "kmgZOsTL6w2RcANA9FiGFsZYFgwwIJNSoi5uPhHUWhw8DgpZUJJwhbcwAlrJ/XkpDgMQdv8+KTaK\n", + "5RNrXWUI+DQboZuQqh0EP6Ucm1iy8BiBubHVtPfvfM6aTMlQH2sGDo7kxk+QnIaS5zzgTFrv32D9\n", + "yKVtBoqoPJ0AuZgM4FsUTuUjy7Mb8fU+FNoSPESiOFS3CYbvMWBzWtiplx16c8G+2sTGiL+yia5h\n", + "U5UjqF9tl+DCrXkPmQAAAhVBm5lJqEFsmUwII//+tSqABlvipo+ln6jP3YEZZAIeN2gdAdBG93Am\n", + "88+PBAP+pBG1b08i0fIFrYTfZkz4SYTuxIQ1JlthBpef+blJppNwqif1piWVs/t6bCj9Z+mNxSeq\n", + "fY1/wgLfvSZhz+cH951YQ+3lZMxDj+AnlpOYgaA5ONYw7fbC4eXvAp07e1QLTwt7AKsxs6j/dp/S\n", + "ROqifCEiS8aS31tyrNd0WUbq8QssOlpj1+9+m64Uuc7+f7EFYNlp0SQRRU2ux+5kBFuUthOQf/99\n", + "ODAIvGEvExgFy7U9xycg96i+XWorpOkUsmc8UuZbMVhIEf4MYVuxmTzjhiOVDlxwcksj2gNb3xa2\n", + "pmXlh1zp/jlUP6lnJbCcR5jJhGaBJ/wuH3P+rOiJDpAwjSIE4agxxO9XGnmQRqhYjiBkbby/Qs/C\n", + "0p6IlpvwhBITpwXRBm1mH+MtJEskEccmYaNT1YNO6b966q1ndwWmG4wqG8yXMOLAMIGnxTjTIpRG\n", + "9a5Z9Xdl+HR4ndQhvFfQ+mQNsGUdDPAaOtDr9NfsDESdrHz/VFsWMxlbozv6ME9/FBsTE8SLTZxK\n", + "uKA7LtdEmFdsikvrVwkDRWs6mlddIWSLEJey878D400I9Bm2F1YzYF8hIer8urpKTRWH3dl5Pnql\n", + "OkpPyvm3RplNwN8DaGYvFB3ajEHHx79ej7jTTF7j2dZAVPOuzAAAAQNBn7dFFSw7/wAYtYg8t2YJ\n", + "aBl5mT7LoVquTMWPsAY8JEk7n2Ltj2VU9Y6yhnUjGblNmyV5I1tDP1WCa31R20KBx8ZAPYjEjgAl\n", + "IBPsF6gwEF1mGQPgwIt+DQ7Ltrn+WWljoOZe6qmL3ODaEJKUCy9wZy8Qi5WMsDYzpEybVU1vipuE\n", + "rsjD5epFom/S3CRpP+JRc2SuBGV9X135AtKz2dAbEFqb0f/DUfvRpyE/xar90tpMsUisBmDyfPqC\n", + "QCIWsyVA62u0XX4SHuuo3VkmdASLaLWJS0hWsThucD2h8t0xx4j3t8tQeFkAoX+vhWm72BA6IAOh\n", + "cP5AynBLYvgLjkBSaw6ZAAABWgGf1nRDfwAgt5i6arm7oDsF+i9EHiOJ6m6rVkYAHTQbG9yseMuo\n", + "2+jJx58xpeovc881Wv+6nIPwZiRTONb2IQaBwPwYP/UAnKjoweUWtNn8yjj61Yi1F5n9oYReT9vo\n", + "YNykd6+UIhqXBR69VB8JEqms6DNcB++Z+7S8cRY1PTjUFRAm3tXpZtcqOC46Yje8Z3mZdWtke57d\n", + "wfIWf/bjH+PQoHPWtMGigrlGqEUElC6TETXz+nB7X3pF40yVazdjxa5pCPS8j1Bqo/RmILtftGxN\n", + "Yu+1c8QTzG5+3qHYIB5lZeEW8bNhQmHlV1zck8pKhAWM+UMUo8Yo1gMDIjGuUuNGCTYOoVand7oO\n", + "JxBESUm+840sI50gEtqO5mhNaTQVfGrhYgQvynil8I63rBmEOncCHtkN57Vx9gduQDjk6aOyO6bY\n", + "qsBt2jiwg3SW9pmMOjEKBDS6IfMiAxcAAAD/AZ/YakN/ACK6K1xrl4Eswd4/m5m3eDoe6aKYRGzt\n", + "qScyJrEz0/YMsioeM46osJc2N8un8CXkVjpps6zgsf8LlkG70ab3ccrB+um/wXzisesiYCwJDgAm\n", + "D8ODYrLA2f4XQyaEvxMLwdPggFdV9SLGW7IaDs1Gj2MKL95CD69ggFd4PlXdr+MMXaKnRfCfYej6\n", + "jyRkJ6YHIJryGsscniQRwJ0d+J+1KTOriJZQomY6moOkqhpxON7UIyt9lzU6HlHOyQJ+oRH5iOIM\n", + "+hKNz7H8znQxxv6dKCBY67rZbPlwYKywoLx2OIjAEQohlh7LdbGhKMy/zzEiJYFobhp2mH1gAAAB\n", + 
"WkGb3UmoQWyZTAgj//61KoAGC/pGgJ9CubE/Hy/U90CEEMEEbF2Q4cnB3oAeksXBYLQl6DX56J1l\n", + "w/mHq8WxaGt2MnAvQ41YNYO39iE6FvpuFKpW712yS65PLr83LJiqo7HZlMfRzKZN59Hb83g9Yzjb\n", + "LItfty44d54BI12++V5xh28HT7V7r0Y3bFC5OovybNWx1HQWDmvmM+uWQT6BKmA1pblkm0jWUuJ0\n", + "KAyepKH6sPnyIzz9TF/cTcVBDLcJ0ebq4QoNf0i/efDFq1nH+LtoZFDiLpeCwZkCLTOE+JMjcVxC\n", + "aWP/XfyRHhNANFDKtoVePLPasXuBVFa5xCh3bB99SWFmaQdxLlk9zHTMNOyCWoiRa9OkdBShrOe1\n", + "dfGrU6t4YEao5nNo7umRhNJMptOYWcUtCbSBQmV/4G3c/zgmpJb1N+5bNROg3nNApsFhNWPnDxXX\n", + "YEcAkKEAAADvQZ/7RRUsO/8AGBSepWN8xnNsxE4oE6H3s58lr1m+iqw+EfUFRD+Jna0+Uvzz41Eu\n", + "ATVBokoBIC1dZOqsBeTj8Ij9FIuxNitjsFqDL+DuZwvmGihDa0HIS79MTSVw/f89Ulk3p2M2jbij\n", + "TpCkIItiAXbWCZspatvMx2+GoOmu0/Pjqc6iwrXWXyi9/N9Jj+yY/ClUEyj7sTv82Y9nVf++GCrf\n", + "1w5ltOrH9rRQKpUQaVxp4gxcgxC4qFFOgMxs83r/WkZSqY9kO/9UmmCqExD/ljnRMUJvxp8FxL1d\n", + "H7PGv4WLI5AeltB+MOGIOr9NYMAAAADwAZ4adEN/ACG6NY+qIzQfcYKCb0AhP1JJtQboSZcB2Ux6\n", + "0kAZypUjTcd/OmJjJuZBZL4W6I8Qwzms0HJLp8KRrHdk5GfU6sWQ2Z+fhfAzgzC1XgPD4QBqkDkc\n", + "T0sPX8iasgf4/DARkJP486Pq1cqH5kOYBwnnR907+n/qb/xaeHwouVk6h00s/qlqepq0S1p/xGR/\n", + "GdINVBgCemrU+PPAyI+EQBjfU66sma3ahiVaLQtsD7mxr/vZVvwLqa7Chr1J9NZveiHKnAzIMG16\n", + "G9Gmkk/8FUHgdrIbZ2heuBDh1KQSBCztE11k+ocodRJkiMj5AAABBQGeHGpDfwAhujWPq8KUOIXq\n", + "Yi8pfsfzwlVQDEG6igccpABq5mcqZlBxZf6f05WsPP5oiGUHFHfSykAR60y9PVPsKziKYov/dHwR\n", + "Kft2Arvz4qT56TCewQ06i1++DP3k7arAvxqk9+C83xiDX/XWrTHQ1+jT9fNei76g+LJLvs+Z4UVk\n", + "oEaQ3c6fXvOR9+Md7sWQeZnYPXpC/0w6s38iG8bM/+n0jsTdTFeBwE6YfrCAsv/ybSEXYS5eoPM3\n", + "f/HRzfWrUb9MZw2WEuoxs0K4qVyNiDTxcyb1DdadbkuzwkaFG7T2ZM6Pebp0YyXRqckmxx6YTGzB\n", + "LlKwKmWHeooj6Lm9LlzVgQAAAaFBmh9JqEFsmUwUTBH//rUqgAYrWZggqZs1s6MH6FUT684nhne8\n", + "ykZKf89h+0voVegpTcVlgsFoS6xwNTcMDCv9PiwISM3bG5gmdpPxwsd2af4u9VMbVGyE78HSQ5M/\n", + "nbkySYm5CPjed6c1fzFNEjUv+hlxYNfv3cPYnGT/Yav/5erFhxatniKB++1xw2wwwm3hwteUjAt3\n", + "Bi79ySg16ijYqJM5fa8+vosVJZysXRlnbW7/ITdmkkl3c8ndruo8FzJ7m8m8z0kOYciXI4QIL6Xh\n", + "qroOcvOVcWB7Uug78ZH3AowGQXzMbzVMrLD5Q7gJi2vHbYwWBG8EpVzYFtaj2m+v5trtiq/wJKtt\n", + "WosqXvVBFnxrWYQFjXg41D/ASyQHPzn2WsqemfWG6/EDepgeax6MAFQfxyDScuq3fNmr8jf0net2\n", + "tjnK9AbUeZfaZDCLHpnptMZuk8clMx5Y+UVSA4sRK6q5yL86vVu3TWQ+TGs9ZFdT4m8kNBPSkwSz\n", + "rQpsGSml5JPzqe84pJi6yJhqfYRsb2q5mJ8tkrUntJCF8lR106wAAACuAZ4+akN/AB1RsSI82HuA\n", + "EDVZr5mUHFl/p/ZTcmoRWj4TfRvTsYw8OlDJB7dvZ/vcXyur4LGUumPqBQUBQHfGq57+bI/8tRzs\n", + "Z+nHU7WH8qJ9BM8/NBixjH12m2oVcRb4XvfrX32V+Y0hU+0j88MNPEcdX4rv7aeeep8jA96PadWJ\n", + "mSmtmcZfJIFp4fz7nGsOeHvsRUbV0MKDUYmKN+mrh03bThLfJGXI3U9Tnh+UAAABmUGaI0nhClJl\n", + "MCCP//61KoAFm+ceSLbmAtKM+jG0tYuAZBSWLg59auQBOS8BoT1gHMsjZkIU234iG6WAeSbLJEu0\n", + "KCLhFA+AqaJQGzw142KKgdSAFtORqvq8YepvegTzCCnS1DU11oB/GUVDtDnboQEryLd0x6NUSSMN\n", + "cECL9Mzb9QebAeTbVcgtE4xPKr7FEgVH4vbNIioC6rYN5svm+n7fErwoxd1c4B0MbzpTJ9ypWCIt\n", + "jDqP/6ecCXKe8Ac6gqcpyPRaKmFcKdx7byHCFs3Y36UHxsmpasB5iKonQtfou1T7ViPEDD+TNshw\n", + "6ncI9FQOyx3EYxNs7CdmXQjjuiQ/hVztgan/8HWeS5jp2zgzBv5BXUEnWn+A7+FBONSn2LL/uQ/w\n", + "xRZTcRa0x52ow/V5cvgKu7FATp/RCkX/G+w1Qnp+0VyZbVkCutQ1yOnQYxf79Uw65C1zWPQdQMP/\n", + "K+VS6vPAs27IKeqUeSeiBKHv/3isIgE+rjxQbN9Lh1YW9R/9r++mSeHrs60NzUtdlXFG/VIZkaKd\n", + "XMkAAADXQZ5BRTRMO/8AFlm8HmElw5CLBq61UEezfOfwLuaBDj371pFQE2TaGfrDL2cPvWN1QZqb\n", + "tmH36IVd+buOk4nAS7OK6LGtZWekVP+ro0ezqUL6LNjplSKI15AkcuTQweCsbYhrSLoTsRiawYgs\n", + "mv975sfbTCY9L8bxROvDNcwG30R1+JWvK+o/hwf/xA32LhBb08HGKIsZFejSCR/ZACyPMiASYPKQ\n", + "KnKHiabUDVxwGq+/saT475SIsPn2KAHPd1oy/JYI5la+DZBAp1lqCWQj4yUkciIB5BAAAABzAZ5g\n", + "dEN/AB8V9DqLglnogAnlbAbcaeEM/+Dr1d94BLu23/b924ZA1vKLZ+NWO2PdXQ6go3Sf7NA4nwhe\n", + 
"Jfk07l2+PnIu+kI9sd8bYLUmTTByKGfoyEUnQqTPIf5dfjB+AgnVTc5y8pWcKU354gRsJCt4lQAA\n", + "AO0BnmJqQ38AHxX0OouCWHEND0XeNAIAEOFUWlDAA6yKdnA6h0XJ5AHh6k3PwK41LuRgTA6dFitc\n", + "eGcLOFImUAXmZeNXd8BBiP4Y7WDb/nj/8t7UR/ChuIYJmbMzvyMcttz9Od2nvufuLeTpnnGxlC5D\n", + "sKIQ4TiAF1Zf6Jjc46nP71VK4g2t6fmiQijizaslPXbGXByTezIrwT4YraOsiMH4GMwabs58JhIR\n", + "tYealSfNunZO0jU9FNwqBbfEknuQIRSATwmWr49+JU7MtkfWDJ9lAsDVu2W/43LTVqxccM6dY8NC\n", + "EBnYMhV6U9uYbKYAAAGwQZpnSahBaJlMCCP//rUqgAZTWZgI3NAzNytjReukCJhCqRIQrgVE5TFG\n", + "RpO1ZRhoAw39KCX0FTF/pEpCWlYTREK0RX8M+i/Zkz6IOh5zRR0GMJniH0SeRA8U+ZBIRrL9Hl62\n", + "8kZwKv6q5Netv/8gTYt8wrrWIwWANbXHJaruY4G39urxvB/yx7ozBV54M/wmK8P5AgF0ljjPQAUZ\n", + "DnLEHwmopi3rWM++lGz+7pSmghGU/3PNF3AxzoRutm1cdRdLqAFKdPRrKeDtflDHW39dHMmsizA0\n", + "JAD4HEW4vO3o1CbLX2IxlZFPJGuT1QOtzPR7lO7pJCxfeGJXFchlosXXXbYjZoXRMBBKcHqbIWa+\n", + "lcjl1FcSEXbk84/WCNR/hEiDPBQ56Zc4Yg/Uu5te5H7B3WBkQkc5+tttienjQao2TkWT/tLarBIb\n", + "fSMA+83k8gbv1oyeFIIWqR6ZYarMVbzfFtnH/fWhWkYB/el6Kk3P0OPSTUOVwdEnhQ/ztu0l8Ij9\n", + "PRLg28jDAaygyMt+MtthW/hM1h+aETPrMcrgZoJoV2dKCm8mLdDu/CmksDfLJBRBAAABQkGehUUR\n", + "LDv/ABi1i6Ag4bMBZUwXqVJnyx2PYc2F7FCjvy82YHTp5//HJrbZhCcYERymRfl1ah1T5z9noaM6\n", + "FqCYiKh/nb1NKcv6lay4yu1An9EGWzEXMRaTXWcwehWRMZky6GX2Elv0mAOhcWIk8WVG2FWKKMhd\n", + "27a8KH0mx5CnVDu76Igw2moc1+yPfDPZnRGymeVWDMSj1/TY3hGgb5hmSfANHPp4nyrFETtH62Dy\n", + "FIZnfZ2tua96PI/858zqXLfYaSaEy66elRjPHGSUQ+kLj7sT6e2TgQoh23asg1dvl0lw6aW2KtOQ\n", + "yQVjdxBZzehiTDj2VDDo/FI5LuGH/jfe71B2giPdfSUEN0GwZPmh+oBJ3YPtBDdEXjvqGtPnj9YN\n", + "o2RsGDqkSW3oa8BY1cptmQPEHp1SMBrX83w6xtQW5X0AAAD0AZ6kdEN/ACG9oBtcoOCFYVPj9Yn2\n", + "v/zfoFr4rWL2j9A7ZlqQHr0ZVpbLuAQJB33EyTSBNnFvVuljxMl3V6GA7Dl0BClPwL31OrTpG1l7\n", + "a7ghzL0atyS5ApCJWtp2wOBNzezTQ3N+Y1tH+luIT/i1PP0KLgniqnzZyMrwKfZeXoYEIl7twi0H\n", + "PJVeAcAdd8vPtJ2LywfKZ3u1S3on0S/4f7cj446r85qt7SkU/lr6c/+gK5erYXiPq/kf9oXoMNwY\n", + "9h0XgCkkY0ibuAMW3BGf/tJy6AGuO11Q5hQVr9nNkIcjB8Plen8B0nqwKQkOaIEp5QYqYQAAAQkB\n", + "nqZqQ38AIr2gIG4zhxx5qcIQ9c2Osw5+uNtUP7c8wH627Nk93kOS5kJwZOUsa/GuB8LSJPcgk4rv\n", + "NNy4X5Kv65LRXZpkjxKOzss2V4BAkHf3fdjwk53/8IYs8s8oIvwVKvgR9wljv8Ag07Nf+XJo681q\n", + "NbSzOUK6bv18ql/byQhgzEpF9gyeKzBYpIes4Jq5ygJqsHenGCQnuZZGCejK/v7YZig/zrXj2vhG\n", + "gCib7VW/rlAZYnZRYtYW6jN8+34R58oAelpNik7qpp/KkHdSQspzMHjVSAa9yHgI/KVEUfAeaSTC\n", + "N1Z3u1GIF1TdZRU1zNyC6xbuAxPXtz6Ez91WiAF1zBDEIltBAAABt0Gaq0moQWyZTAgj//61KoAG\n", + "e1mYdETW3g4OxfplN37UKMHTaFqDxb+9ytAjpKDc3XnMw/MxT04D0MH+PToJ4KWEuN7AocErZRv2\n", + "Rz2GQBbpS8lS31542pk6xM8YYh0/yeF1AnMnBxO2+HilOPhojFg3EW0klIcf/AybMYAo9NSuBD9C\n", + "s4e75EU0t8atdvYkg/yfik+FMNyFYTUg/mi4EKL8VgLWVSi8mxQ1+/EWE53/+fwb7K+j+527pMW9\n", + "VCj1B/8oEXG8oxyHRw/TQGPoBS7lGz9zLwh8gXusGZBvY9Xy0pnRdJKDkZLO/YjZFLNiCRPsHTqL\n", + "i2GYmJ9itG9pRnevDN9cAKQP0fgHBe/nvlXFVK7JMen+RKub1gCuPtFfO/y6rA2fstwepz1bap4Z\n", + "wJXzTLHNbeZ6/jnjul1UTQDo+Wyv2+WNy23qAxLYAQV2nquSCySITwJSTVvg+SdePIAmj5UPClGF\n", + "OrJIf0RX1xfSrhrpF0W0EhW8ceypgG4+dXb+bPwXKBwbO3GymyW89X2WJwubd13etWWTwju8K204\n", + "+w8LWTwxqMyJaP52mExMi4W5Yjr9AyAAAAElQZ7JRRUsO/8AGBMzRwWmJT1LkI6j4SrVrkQOkpGr\n", + "7qmVB6agtU/P7NMI3vz5LIs62lee9zlMDhLgStRXRkKeHaPAGaY9hwFwZg4RZnlEijsKiC6r+GA3\n", + "jOJMGPR2G+iEvFq9JqYdk0b1d9ABTX/7oiMKav8zTfVNhhkqe32oj6u1ioYXU2U/9Y4cH3f/N9Gx\n", + "JhjbFALTGuJMdeB2a/pmxPSRSx2DhwUwXe3BT4iK5IJF2QdQUjRydlTK56i3AOElSAfT6NVqnLr8\n", + "mfbO/AiWtC7ZCdSKqLQrBheoCisxuwRDc+0Qj4IlPLBawyneGpiLaece3KMzpKTos+5YxlSYlKtg\n", + "/Me6PG+fH2sUI9B09T2Px/9ucFTXTUC5j4ELLv01D5MY2VAAAADfAZ7odEN/ACK9oCBuM4G+uLMW\n", + "L2dP1lfTvDhmlpluM7IE4yEUJKicqu4KM5OijIBGmwd/fv/FYUE8C16mNefQ0Uy/D+0+Hpx1ZFAP\n", + 
"3vl+5XYGW/hV3tVz6fpDmClx2VYPTKI+QsHyxc+qQa6raGV2rQAFnERDWDAoPELDpD0DBzrtQ9Gj\n", + "f1X0zbjtJNpqrwp/hRbaIrr15pQNp8wHXKVl3vyz9d+FD2rUtkJQVzj6V7XpNVWdz4mpDYH1JRGS\n", + "i2MURr0RotwXgP3Qnz/8L/EyxM0Sb/CNWw8xQFPmbCgpDwAAAOUBnupqQ38AH77opN4Quy1TZxAA\n", + "Og5d0nOlbRa1c67qPfhIW7P+8Av3GtFE0HFQCvcwO1xKybwlnguY0Nqo5bzwqVZ4m1UebapfH7JG\n", + "d9M94gSTzLBzp+7XrhnquJ9dwfh5fBCyLWBt8xSfTcJZr1HXGrAMOw+Jv+pCMMogCsMVlWbHeQuT\n", + "mD3/yuQp5lDob+9AYNdyDEIT/fV+2vxg/LuQxTIX08ne1pWMu28zMsHEcHxols+2LTEYzIWCi8BU\n", + "K3ZtJRE3rAjZxLOQ4w3m2m/D157HitClmlKcP9jJchoyWV95Jy2gAAABu0Ga70moQWyZTAgj//61\n", + "KoAGg+KazAhO48Rk+mELCfGa3jedcL7j4i4wMKqReszSNQj5h17BpSVMT9hX+zPhBrSs6Vj7HyaE\n", + "qm6lvw7kPbwwNhW67XEllpB7/AB7Dtmc/Lsrl2N4BzMZzIFVEJCqVkWDwHz0DCyLCsDRdQx8uGEg\n", + "Ikolt9wM9AgzvQ7TxR98jTrIYP8SP9CCVhDDASOwwiUKcH0pWRrgAYwjw8Gf7OlbogYj/no1BpFx\n", + "lYglvem+TH822s9SIsjJ3EA1IN/sTGSWgAXqwMREDl6rGx1E4un7krghrGWUm+/7j4jDoGqrYrQI\n", + "g7E+ktnqOLNELPNyQd8WQ/umSuXC1xL1umwA8X5+yPqMMHEIeQL1fzz/JWAXyMH93QMSzGumbhKw\n", + "Zwg0U+25Tvu4PnK5VQHbV0zvOU2Pj+MGf/nsDxqxrqZsD9S4YY9rcTfMxz/MkkzIgfRGQF/OgLHr\n", + "joIjF7P6XCeWe+XUgCwqZQG68PRNzfXkn+zUJpMMk0jjnoYnDkQ975Dz0Z65i4o7OdZtwLEOfaoE\n", + "pB0fo5td4PyA9vYIFlRo3xi7uvrQcih7/M7KbZFgAAAA9kGfDUUVLDv/ABlUeHLsmGHl+OQZEho1\n", + "hMDtEgrgr/N3AttUVM/7crMT5dwlm5uvzGVCn6w/p670sqgr5PJ6oiWC1npINQXp4CRzsctCmXzn\n", + "Ugai5K7NbwfaQcfbZKrjzT/10H2u4nhhcuuZyNqUHfbG94mETU3kKDy9A89Il0BA9I1A+R3yjNfc\n", + "+Nz5BwP3DN+ZYjka/GHLl0y68JgPyPoe9w8jyG5IXdu2vCa+LYvH9kU234z4psgT4qxlrdkhxxyP\n", + "UJXN8nPpx6cXDiQznv0L2owqy0csZbCzUw4CVJ98G+4T1R39bjI9WT0YHLigorskW6Eh4QAAAMYB\n", + "nyx0Q38AHP005LNxTWEpiZ1J9di26t3EruDGda0AVBouFN0G1ywEJMXJZuIMxrfHCac7PtwdnQsN\n", + "5ABPxruKApfvrd4v1WFO3Cl2Zd1SOG3/r1ORn6HwtueiSFcG0RNU2EL7iLFK3PfYpxwH299J2sER\n", + "9fENVpZ0Q3jjs6HsM0edV/QB07Ofn+R5vOS4TYLqhcaZAnuosw5RlS5g1Q8CuW9BZXMHWP4TGLry\n", + "nY5Y9ez3m8FrqVUEclyyvuywjGI3odTE+j8AAABPAZ8uakN/ABxO1hwOVZzdUT/5uAde8ARk7MlV\n", + "yrPTe1pSPQMTQCpdw5z/lBFmnGZwxWyqh+3IqkDkhpoxeW8ZCVdNB2x/1RnvvpDhcO3MwQAAAbZB\n", + "mzNJqEFsmUwII//+tSqABlJow5npTNmtYD16z8AGI7v0s/GnfyqOWOrIj7MzWLMA+5yFNFLu1hTu\n", + "dlbGlkD8jL3ONezhs0gurnHp2pFLsP3djo3BgKHcLr5q4kg5WMX28rT11jnIH4bHAuJDI0/Gub5+\n", + "542H8l9OurnbLu7ccDaau7k+AVcLYmIJfjhEaissSRpn2usY/14Z8WeJwbzUwclx5b0pufbMDj2m\n", + "E4jonmtfVQvsVKXSLVBGus9F0XUey7wsw1/Hxpa1Dj6X89JFMTZZDEgLc8SXNlb52uC+3SYuA3pO\n", + "yIZ3zYRDkwb5/sIpC9s/jtT+DR4JrFHAg/zOLQvdBHh2BZ/H88Qk1FOi1nkBwtogVwTsAvTRwaaM\n", + "L+Fy6Vw65xxtt2p06IrGo+vGB6Ev7rBsQ1lA5dJTwIES1/HSnI96cCqyJNRkq8io7XoKHq1jP8jJ\n", + "K8KCILcbnjTzWMILhY3EuZ8pRzEGblkg+ofcWDech+PkwDbk4flJvQ1eVGNBBbzkH58MbHNkp5C1\n", + "pRDfsnIb9VIwGZIgexRK5GP0EM8ZveKhcNpqg0C7EdFVGM7dDkwAAAFMQZ9RRRUsO/8AFKVU3AQX\n", + "TKYCKlUskM896ABcbpuBaq23+VbIBAleYM+Uh2fmC8hKxXufvA+Jyd8ERfcMKq2QBuOeaw8cG8nv\n", + "l00dW9FnZ2ewlISmCmZ99L0bw0GXPORXq89pSQ+5zLmGTJWLpbqXg/Gg/k26eFQ7yctp0OrjpANw\n", + "gpKfTmSwqfpdIyAO4i1HmWAczC/dxtyvK6EJns7ev/M+uhg/UBsLPdCc4ktjYaoFvgpYJl8v+SaB\n", + "iW6/qJFs8B7ABY+Xoa/3pJdDPx7Wo16RIr9F0VKx7gY2CroKhVZyesK3QK039pTJworswqeMoYtQ\n", + "SxUGWdIlnZAh/LxAqJSAgdbCea7vV7Jw7UJ3RZWLCaN03DO0g6FTEO0PNlB/y2w2d5hCS2yZtMLR\n", + "726poAjDu+5lgVHjodzIR1vHcKS57NpFhydymmBuCPgAAAD3AZ9wdEN/AB0F8+qoYAk/JkWPAABe\n", + "eS/K4R2z8W8rEZ4Es2dHO2B1xqZeWERk/2j9D35SD32hnizfkl5AQkKu7sKMRtxB0qUTg/5Ai8ci\n", + "ewPsEvh0cTnE+UnVVZQsy2FhpSkguxSgj2GzhV7H4B4oQdASRatW+4ge9XWWDwbNzKDfs2ikSZGn\n", + "ZK2J2cdk5ZNdF/NbhHS0c6vDp3S53pob/1OoP8UOX13YMuZJYtnSstfaINj9HWvrLOMusuMgy0ge\n", + "hr00WpqM4G4LNFMeeHMWs3VdDioqjp1BlI0pyKTUMl2eH+Urm0ENGx6u7gM90gDkOBdN7tgm4QAA\n", + 
"ASkBn3JqQ38AIb2gdaiDraOcmiaTmEkpG5LCwpD7mwoBhbYx9hK/huA/Rlz76MMOi96iXfBz3DSh\n", + "vG5XYVehGnggzBAkHfGgYDsO5F3SWLpvAiWuQYgw379rpdMwhqWoBgIHHe7UqoU3PiKCUX8CUwon\n", + "PUuq8JY4AYYztu7mmGelokJyoAJS97RU/X6H+RdsNNzitkC1d8I6jDPIy7qqN4tCnL3rY6Yesfv1\n", + "e8kTaN9S190RCoZyxCFd2JzsfgZhniY0nZmfUb/Ilr3HhSfAoNjT9YPJpZU0gCEN/XEjzBiwlPnv\n", + "oPqWZP16sXNdepP+5XR/WuewqnrAjpV8x4yn9rFVK/AamriL1xzzEUk66pD3JF3R2TNlp/oPgGf2\n", + "3Zht7rWDs3F41xpI2UAAAAHmQZt3SahBbJlMCCP//rUqgAZb4qaPpZ+oz92BGWQCHjdoHQHQRvdw\n", + "JuWMeCAf9SCNq3pRzo+QLWwm+zJnwkwndhEvWHQ/SujctvY5pe+lS1QEjQXzeizSF8k6tO14eAtl\n", + "F+Mync2FH/YIAKwBXgDqn6AXOHpWQcynHtaJryxWYm270/11pJpJLJP1UcyORiPI54DPlbzdu+l/\n", + "jiFd4hpdaoZTSIPUh6A6ClqPxEqekFrNjAxud2WiOSd4IE7Kaf//vpwZ0mh9bmck4Z3rAu3/6Cvy\n", + "KA3WyoqAFX4UT0ZjH4z6LrUYRBEZElMEZc4snCHRyZf+tjKnoDXWOrVFpzxu69dV7GJ+V1irRKox\n", + "Pd1LRXYUoYi+P14fumR2pYbtX+VBW+m+c7NAd8Z01d3TTKV7Mg7nTZdtCA/oFcETl7++5b2EIheP\n", + "k2Fg+5ToPyynpqzSsvv9vWMyfYTJnDg6PojbFsxSs0nRUvqnP5QCdr6QHBhWXFOG60F0RsLzEsNc\n", + "wpNcPfKeYjjdCfe8YUIVjq0PBSvcnC+B/ETQWaX7IFbWhPaknWILlx3KsiYwYSMVn5rwfQd4Jkdd\n", + "9H+fdht5f/EJHYCK5IGupAjPxHpu+QiB/iUSmCHkkTiMqsG8twzlljjsl22n8veAAAABCEGflUUV\n", + "LDv/ABi1iDy3ZgloGXmZPsuhVsylb+qqNi7GSIfQ+OHuoRwObuWCiDJsleSNbQz9VgmS3f493Q1l\n", + "fk0LSjQ0QBKQCe3UmCkV8vYYHcKN9CZn1L0i/3IstLHQcy91VMXucG0IQjYMvd5K4nw1TsRQ+zNt\n", + "c33OM7wT4gTiFbFnfUP6sORkbyxKD8+9VWHRCKkGnoAnjqhwkHV3YzaNKz290rB0XwxFDvsi8iqf\n", + "z+DNrf49LxpvDCniJY8b921MDAhjoaXQisEELwuIkEG2MG16iA+xn4KZIc8cifkUnLKYTAHTEosc\n", + "/geFGHZmG9d/0Ad4ehB1+UFj3eeT8gc12jWX2ySdSQAAAUIBn7R0Q38AHbXz6qhgDdTYSzAi1h3K\n", + "16Xr3JTVUajJdHP4n1zwK/61yxZ9pP4QSRtJbkJZWH6vivN5vckWYfjVoaQoNcq3qWx+bI+OTtrh\n", + "UNznJnNVmMngQpK+748FuR69zyCunCVVntkmuIrtQvOCVbqBuRz5Qxvz7t49H+VL6IAp+Rh2gf74\n", + "0j/UPUfosZ/ElbvCMu7rvOP7cWI+JN6KUOE+/AXQCyHGSkSvvSc5FsX0fFal2fQXaEkH67EHfCc5\n", + "xhdseiByl+PiqAs8A9zuy4qmXDeeIj+3Yojnw30fZXbmjymzKitBenCylofDP0QjYedpgwNVFWxv\n", + "pKDrpf57i5C5JHBxrkMOZNs3TkoKjfQLvKDT/j1Fvw02tHitRU1MR1mnPja0zhtM0e5b68dpKMZ6\n", + "9AO+761c+Ba/40Js4HhAAAABBwGftmpDfwAiuitca5eBLMHeP5uZuF9cX0/VXhqHcuiBABGdnZlB\n", + "vvbdh+1A3f4uQyVZizhw70/9zDh2nx3tQGn11M/7g3e0ETDcFJMpuy3pyqZj8OhCsFXcJg/Dg2Ky\n", + "wNn+F0Nd65xqPmrT4IAWVNyWgNuyHhWrg80hH2qe3n3QFTH+AG0t1LUQWRwdt8cDbAi+8IGZZrTn\n", + "QzKAGB5g+jkMrZS2t5af/14Dikh/TUO9x6vp3udUZwfEqX9x43nyKd2KkcrjEt0VxTQ1LHt4TKTU\n", + "ov9g2wymXIrIg/m2cGScMEoY8xa4E2v0IBu8Siv364Oh7cF3cjWG+ZJkZ6xGCUsmpmsJt4n9AAAB\n", + "cEGbu0moQWyZTAgj//61KoAGC/pGgJ9CubE/Hy/U90CEEMEEbF2P5yKT5EQsPLolJYuDn1q5ANTN\n", + "SJwpmVcvZVK2Tco4v2Comd7hwZPuuXhX+lvh+l6ZtjrC3czf1ZVbdumb3r3D/ioYe7qcFNf7aS5r\n", + "2YnlPFx/ox3Po4uR9L227Pa5JPu/JVHojzbyIvC2hUPLYoK3yo8EFTOEx9VW2Kka/dDqBAClQEXM\n", + "coaHOVrqvWOBlx0SmrR2Fn5qD0ttjA+wKyG9Ww/+/fxdGsIy8lThxbGnpYEDoqIDxAPPdyC1j/7C\n", + "x1S6SZ6cX8TWD+edELbCVScHr4twowGayNRkN1sGJ3ChzFZqefnm592USWq1KVPalCkn+IgAbkI0\n", + "gf8crEnxuQcz5L3ov1loEzryk4ptgt40vN/cUUrwi49uNdXDzDlba6ntBbOYIPKYQqVbRsWX//V3\n", + "7VjjZzb0fU2VitbTbNlERmPP5obsCvIRmiOfAAAA7EGf2UUVLDv/ABgUnqVjfMZzbMROTbEr98Ov\n", + "G6hTv8LwbEOVBTuoZFwTL9eOUuW51yt7Pk5XoOwvCITHjPxM0+ACPLC5p8LXGPLXOMFwxyKNAOm2\n", + "+bVnL7eC/eonqWYHV7ElnGiaPE4DZvhksvIAUMvT1hgYsLWg5pHxPTMEf4vPc7k/U4gx+qn0dLIb\n", + "xLE6WPqhOli4SJOCHhekKlwgxlnM6S8wIxjTrZQVP6tyjUXc7nRDpn5+4xHTB5JTQd/Y+v5uYYim\n", + "vSxL9Lp9+sJa/YqUqQ0UFcQR3Tlp/PCrTJ5gUcQmlTDSjEV8pdpwAAABAgGf+HRDfwAhujWPq7Ze\n", + "gCJPvLBRhSSbcG6El3BFXKqbl3V6+XLJCsWmxwO7Xskzh85D3/GGBbxCjXU3okqTeEYfyjkOl+SH\n", + "4VGFs6uGeBXI6FuyUdCktochZVIQW+D6bukSQtQ9xBoZWqRH4hlWFBiT6bV+GQGerlgKyeaNsqD5\n", + 
"s+IDfM/wce0dikHUV0++Nr2rHe3jcRRrSy2FHjFSMdnyldmaj1iFauYYGv6d3l/8LPJtc5g5u4Q0\n", + "WerxF6DQAN+WlQUAod5dWuqnUKOySujKDQh4Sh1bNoaribkhCngsbjiJUpnyDzJfWcRyF47YB87L\n", + "Omkfy8ijCTvweGsJYAgScQAAAQUBn/pqQ38AIbo1j6vClDiF6mIvKX7IDWIXdy1QyeJm7hwAhKrN\n", + "5ZQTH6lrtJ9D3xtslHyvy2ywnd5a5/owLJHRc2EtkPadJ8Uji+G9O7CT6ooBM3rAgAWaKgWADHof\n", + "Rk55HzZ+V8DMw4S4pnRLudTRFnX1DyLXHV3VXMnhAeP+ewFDtdkUHGMhcSI0U8KajX0wWNdBGeGb\n", + "D8Ns9BH8mxfhSu/SqyYkA2AIdaTRVyL0w7XOVFH3DXljVqrcwMdXPvGgiBcw6chMaLbepo7nSmh1\n", + "vAbwAQYruBhNTN0eawky0jofbme4HocI40c1sz31wjy2n2/uelK4XikXYFYmVtl4Kdutz8YAAAGb\n", + "QZv9SahBbJlMFEwR//61KoAGK1mYIKmbNbOjB+hVE+vOJ4Z3vMpGSn/PYftL6FXoKU3FZYLBaEus\n", + "cDU8hX8r/T4sCEjN2tKC+to/+IoDOzT/F3qpjao2Qnfg6SHJn87cmSTE3IR8bzvTmr+Ye4Ac/+hl\n", + "xYNmjmRG01XaPV08JLNnbV2zuL5cn/7CsR7I4pKAadGKE6UheVLfqn0i791ThTaaO2OCRjsSWF8e\n", + "1o7SXLcWHdmh1WCFSlfjet1S/FkIphxf8M1ZQjLPF96/W7wlOpiP6jEis8o6251YpmdqxS3VSmv/\n", + "s9Bv3ISLvkMspiZj+iQwr28MINay/7syEY2A7ZiKqNUJX069yti8CuYwd1gGvQZSlufV+auVaTNU\n", + "xocXs0XuFW0e/AWENf2i3yxrLFTHW9CCBeoKH21CafAHq6hi+H/e9DkZU77nSidgvmP6DIx/XjI4\n", + "Sp9anaBxYwcylzQtEH2XN+nrwpDPp45KYG9LI0xieadJ2QOTHIvADfNhP/PY2gqE0NQ2qkvQc0a7\n", + "Xw6JCi5LfZz745MNAAAA8QGeHGpDfwAdo0DVwAgarNdw1dyEo22Z+2voCmn3MepWOJpNH9uE22Fc\n", + "UAf4fo25DS3VGYdH0kZ3bYGxdzd+R7awrh1yiW2ItRU9+fbZ+7eJ43X/1GQK2tLeuYX+rXNnNYVn\n", + "3JiyKGKiuk48G4gEpBGTo6LBxeBZg0OXhUHfR3yB3h9X56ir+g4EbNusZoLNQh23BaGzc9/s1PO9\n", + "1PPSEqrUiAosSTAygJNCJGqMs5yCqcS+EZopY3ntHhRp/rTMQhL4aAxAb8XQkEJtEmWrzD4p1eX6\n", + "QEZh/6hTVX/Gz191R2H/Dtkpg79J3GkssFm0vPkAAAH+QZoBSeEKUmUwII///rUqgAWb5x2D2a6r\n", + "t0Z9OpYFG2tABdnWLgsFoSkhKeOGdpZQLTxZJNtdR1o3VEUaCsJe7TDcWLiNBjbFk4iCHCNTwP1B\n", + "ET8aIdy/mqBaPrTdtuT/6FMRex7yXV0X/b0t3IdDKZDeFLpQzjHVkdbvbm3BNwCciVQUNcJ7Sjbw\n", + "T4hbhPp0oEDMMYqhG0FXqi8cqsDNhwZenV4L974lIjS1k1BRVCVuxIwrhHZ+ZNeKQOVccqtyU7fb\n", + "1nmmkdbnAEav9V5tnQTxoYHQvrZLL4f7C+LE0IOtSnKggNbex2Xp0FNi9T/+fjTgmF5bW9OJ+WCx\n", + "leyLvNiQF8k0bwSPMh7702+7OB9yXypsT0VFN+3fNlolLg4yJ7ye2ijeDcs0TyR0KI9OqHHwk9VT\n", + "lv0R4DjKMuNtxv3yyDdQ02ld84rRe/IbVoqtujoBlwArv27SRkTybmrwQddynU1vfFNgJ2tkTxsX\n", + "EuhAyTUDk1pdyrePvO3Kyjq07E+ZdqW1unVDCL0p2PAM0Bdj+ozOm4QJPGRq3YEQjJpnk1BNx6E0\n", + "yZMxRvkyW2tYZosgoDR8rW5jEN/sH3PsICgk/jLYhgpsvFfXxjf0NPxMCt81bgYfKxBAoUrGuF/8\n", + "Gb453zLMx96NgDfHj/3/yVULmADuEWX3e7X8vwCYAAAA+kGeP0U0TDv/ABZZvB5hJcOQiwautVBH\n", + "s3zn8C7pn/fvWkU93yxomewKAdw+9VXghKzj8nMy4EQ6n26QhvOvN3ZOGl4wrl9GlrTzWwgssqXz\n", + "oLBd9XVA4LrC7D/kDb3CEAYvcHCWxuhsk3WHFeLlRhwB95RghbDR4boSp+CQz3CY9L8bxC9Ohf/r\n", + "dy9+xoLX1H7kyaZJ3YehTdM+5Wu6Hpc4XocPo/ogFns0WlfgVPekkiZdh228q3p+OFEAyCsprsbc\n", + "bh4x6zwYau0C11ECccZga0PS18ku4j08dAfMYirHksImmVD9Aw8yto6D9YLwntF8IaA+FPG9VagA\n", + "AAB+AZ5edEN/AB8T3aVQEVcYwT0kXXzzDP4yP2lC7bONTcb6acU9HQ87UdrkSLI4+OHKFlU0EAFz\n", + "P/GPhcZ5NOIVfnz6vsVd3DH3XZLg43PF1cMypwOcG8sbzfthjMA4FQSgVvJe40X2MhECJet9t2G/\n", + "XdWa+YBzkUuLdbRPBeGTAAABAAGeQGpDfwAfDtYNiNYeWLJ1JGi8AHLac8oZrJR5tDRFy80bn36g\n", + "01RfxVuWBDFeUQUU4VHoswV2zHbq6MzAloc0SM3f88f/qXApn5tj32GTO8MmdjG+5h2BlZLr7lVk\n", + "BcTdEueULRCVgGF4dFB9PX4Y3jYyGQfKH/BWnAEfbs4hEQ8ebrGB8mSRpcKz5q1oNG7pkp8qNfsq\n", + "nkhG1h5qVJ826dklpNvhQDQdQnVi0zusZWH7g9GItx1/0euTzo8U/z7D4DrbASMUmgB0DC8TSqJd\n", + "xZ+UMAYbubxMdW+iPv2N1tIKXHdcOVBHhDDt1MeY4rBQavQwdjpFZBiUMt5ya+AAAAH0QZpFSahB\n", + "aJlMCCH//qpVAAyk+dgiPwCMdRFSufgoxGSIR+/0rSe9Cp9hy8WpEfkfjpu1RSHWd3zlulcFC+Nh\n", + "XPR//hjTft5KlTxkfWUjrzSX8Q8sCzZTRHqzVvb/rscPsXHQf0E6taB/yJXWDm9ZR5fbjX3mwQRc\n", + "72p/7Nk/lJUO//4LM1qLgtlckFFvGA4aviZYHpBb9w1OJg/Jqwvkkixar7ua0LNG3ane8+4yu/5g\n", + 
"n8krsqxREhrpsaI39b317zkKj6KVaeKiNvQ1KBsts5QsX+yTO1tzmbv5PRxGS8tz2hKf4zB8fbWM\n", + "XhqB6Gi2mMVEo6jXnv5vErjT3e551EcovqLpcSnuFBTI4jT6V7ZqZq5zqsmn23ZqFTbBnXJfy5qg\n", + "Xc1RIbUSG7SAPcicWIbuNtZ4GQS+WKAEZUxr++6VPQD3gpW4BeKCxEy910wCA11VXaqCgcSgS5FA\n", + "dwACIPfrp0NhEyPCvA4qNFC9NitDM1I8HthEGAjfRL6imFuJfW4+Sk08ZcO8JNBK0/bkkNG7XFo7\n", + "Hs15nZek/o+FGsRiwki6FYqc1HBc8skTelrrFiYgicL9M/ehriAlP3GGSVQdD58oSyTAbR/XOwHh\n", + "/k7736bu5rnUg2SpAi/FdrWUFq0zx+C7UUDgbK+SgABs/nsA2PEAAAE5QZ5jRREsO/8AGLWLoCDh\n", + "swFlTBepUmfLHY9h6nZJebQZXCAk5QrW0LEqJOc6Tf3RfmBa+BH+trXpxDsoWsYBGGxFB6vHNSw7\n", + "QTuHxSINvJ7kINONdsnA7unyZfe+/dUQpBab4cd9DfyyBJrHeEf61R0Nfn0RkLu3bt6BWIYQlYtM\n", + "K9Nfs/vIPwJSfjpXcON5DPtNNDffXZk4RydlgN+S/E7EUmDtA6DaeTT9v6cz5zUd9DSGZ32drbmv\n", + "ejyP/MmN69TJZPy1fo/BndGgtSNNbFsKVeTDjxqdcz9cfjIrJ3P86/aSSTu++gY85cN7L+QFkn5k\n", + "/lX20+90kKxSs6X+x+u7me+jslyG1ZQaBGKwDx+RwViiPwDARZocg2yGxzRByDsEM59E93SHlUl9\n", + "GT+PqBiUfn848MoVbAAAAOoBnoJ0Q38AIb2gG1yg4IVhU+P1ifa//N+gWvitYvaP0DtmWpAevRlW\n", + "lsu4BAkHfcTJNIE2cW9WUS6DTP7xEfhthE/Au7/XTkrYH5bPnHuWMD+L4E2Ys7TDv/WnXsb8WMjs\n", + "GVKLefmxcZqtW10iMABVusPZiYCVoxR1g16JAWeZ7iIjTKxZ0g1yWUY7SYbSh6LLTrvWvhE7lU5U\n", + "CdpswEmIpPdhoFfYojayY1ypJuWbbU1PB5nvwD9t85tVUeFQcQm5aN4kQawNooLXHpvRUW63Gqd8\n", + "iY0WiZheEXu2JHmP8XM7t/dfyrk3Fx0AAAEdAZ6EakN/ACK9oCBuM4cceanCEPXT9ZV29ukUDhUK\n", + "Q43qY97tIKPQ4ZLk+xSOgxBfQxL7yIrZscfkKmKCSoYxQfZ+tSzvOZ1GhW2ifFuVzAIEg7+77ixc\n", + "Kx//CGLPLPJ464HVUHGkhcx37PQ+kbQrXlUbN3cWUp0Qf4LtEibFhZ+LpSZJ4udEDKi6Q/S18Psl\n", + "/qmdcccWROb1W4f/Xy9V+lMS0Du/XhxzsIhWccm/rlAZXG9J5NMLdRfS734QHwqLqFpe0KPTU/Mz\n", + "iY1ev2MPDzHxs95uiDK6gRc1gvD7TgXhVki57ReTigwP0Vcnsm9mMNHj3Nt6/RMhlMwCLQhy6qqL\n", + "YC7Z58RnNbEutfWZAa9Y2SYIcplB+x/e/c7TAAABlkGaiUmoQWyZTAh3//6plgAykehDX8oAigHL\n", + "uS7e5BiYpAhLP0Zp72qQ9WFfih2hD6ViubvwAAAy+5vuYY1yi1tJuPfBi/DL0xvClymIwqUp5EK2\n", + "pijOf291KPaqRN5kbJjB/2wfKr1+XMiKLX6DysREeFfQlDwQLBucvt+vNOXQokOSOb4yTYfCyIZ/\n", + "GHqmX89FI8GoC7SVJ8dqrGOCOpcjHfvSY2QsrqBh9dhAV5Sl9v/BQKeopbgb9Qoepn/uEMh2fyEW\n", + "JmX+JgRFJalJclAgIlVBNaF+FoinY0YPKhqMcuoH+rtaEk2LTWu4NHdn9ysTAkHlBR2G+58hU289\n", + "8X49s9CJy7d2oeKmsapTwnIxxJ2LNCm+TxMniHit0ZHqI5VMxQ+5ZJ2tPHM7/cT3gdae3yVR8+YM\n", + "/KU5H6oISvxSd8TybIcXMyYVHn6O+gwy4SKx3AkMYLFpKRIO1eI3ZmEPll+L/2Ahp3aDBQxulIlY\n", + "Qc1v4+BSAHSjYxY/VpZwrkFkWgmuXijX9pnceU+eCQb0BkKKYYEAAAE7QZ6nRRUsO/8AGBMrmVrk\n", + "4p6lyIABr6JcUvWGXYV0DKg9NQWqfn9mmEaqxk2L7hAoVLAefd2AT3uOnaK6MhbcdSJ0jbOgAdky\n", + "1NCtoTFYEK1L3oNAlJW78V3WE6NttmJ67HTQFhc7jbPt6n2fAdknrF4tehh2ttPPRj0ZMNDck2O/\n", + "Og/0bAxzaaL7DSYz/qGCfH6ue/8E9mejEEqzP8HffVv8Obhn2u8eQxOotWj4hO+DblITeYVYJXny\n", + "h4Mo9PoOPQCtWY4pEEbVZmokYfc6NrhoTMJC8d+WVfQUp/9dQN2FtoGBhQPHEwvVbIcYhR7B4iO2\n", + "lHuM7fr8Nz2PLRQOuR4Lhle59+tgw9IpLSJGfVu5u0NIKILKM/viNoDYYuKxIDdR/J6apnFKAoah\n", + "uk9v6if+0v3ru/qsdmBBAAAA3wGexnRDfwAivaAgbjOBvrizFgvOGL0/6vuXDLpZruFaiDwd2rdX\n", + "jHVzx9p+aFelpPZGVUpD9afD05Q+ygH73y/cGcCL/wq72iJds0hr5PUpNV/aSoB5zpjnS1krIC0g\n", + "xgvcsTNLJd1aFsq1w5umkQK05c9QgDPa1eUOrMmn+/YlpdytXE6u+4FAjIpYVgn74StUYfcT8IT8\n", + "SGX5Wru0UB/4BiwZwXDYz0r2pPySvTt1TUg57ubb0S/BqMvEVZ5rArNFw0GaRO5EmmTuHjFK31Ed\n", + "ZcrudMiOWUSCfSesj44AAADfAZ7IakN/AB++6KTdC0Gg2vR2G3QAHQcu6TnStota0MGq57eEms8e\n", + "GSZ8YTYymFLgl7YZGG1YXmh3orKEBl6b97W6tU9/+wsf9/cg00EpDLAMwmuhlqrl+tcaP161PaCT\n", + "db1JjfLZ6rQlIR/u8Lq+hDMPBrZgZ6lFmsHEDUzmL1vhrC/Eg5wjH+dLR3xJpn70Bg13IMQhP99X\n", + "7a/GD8u5DFMhlFEykeU8M0AF5LVwxauGljyJ2PG9wt/W7GNjLNgsX4aFTR897+cKWdUMsr13pC8x\n", + "KjWMpGHXcQ2lKSkGzAAAAR5BmspJqEFsmUwIb//+p4QAYn+ayCPJyJ7QOf/irXuB3I7yUvrv3Wd8\n", + 
"OLQaJBb/+EMR1r6SAeh0um3VtQPrwYoZU0zDlMzZlECRYSRYOAqgamI/sUVWVEYaYAVab8QpucQ/\n", + "sSTh0wVtYsFYYkt/gr7uhkEpx1NPSuJ9CqWeDhMsefol+oaGZkPTooDGiCB29X8Zubhk7s13xY5c\n", + "l2KWl6cdQs8QOBu4PKBLJa04v3ctO+FHUCNJTXN7J5YnaOHn+BLPFy7A6HoUxVmuK9kB/hB9j6ln\n", + "0nykP3r6vgXJiVxtga3Ek+Zj3edZUHSAUux6bbxkCgdvPWLgxmKM0iIQ0SZS+9McjsqW/5Kw1hL5\n", + "sobdDT0GsHJ+I+IDODn9/vmRAAAGqm1vb3YAAABsbXZoZAAAAAAAAAAAAAAAAAAAA+gAAB1MAAEA\n", + "AAEAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAA\n", + "AAAAAAAAAAAAAAAAAAAAAAAAAAIAAAXUdHJhawAAAFx0a2hkAAAAAwAAAAAAAAAAAAAAAQAAAAAA\n", + "AB1MAAAAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAQAAAAAGw\n", + "AAABIAAAAAAAJGVkdHMAAAAcZWxzdAAAAAAAAAABAAAdTAAACAAAAQAAAAAFTG1kaWEAAAAgbWRo\n", + "ZAAAAAAAAAAAAAAAAAAAKAAAASwAVcQAAAAAAC1oZGxyAAAAAAAAAAB2aWRlAAAAAAAAAAAAAAAA\n", + "VmlkZW9IYW5kbGVyAAAABPdtaW5mAAAAFHZtaGQAAAABAAAAAAAAAAAAAAAkZGluZgAAABxkcmVm\n", + "AAAAAAAAAAEAAAAMdXJsIAAAAAEAAAS3c3RibAAAALNzdHNkAAAAAAAAAAEAAACjYXZjMQAAAAAA\n", + "AAABAAAAAAAAAAAAAAAAAAAAAAGwASAASAAAAEgAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAA\n", + "AAAAAAAAAAAAAAAAABj//wAAADFhdmNDAWQAFf/hABhnZAAVrNlBsJaEAAADAAQAAAMAUDxYtlgB\n", + "AAZo6+PLIsAAAAAcdXVpZGtoQPJfJE/FujmlG88DI/MAAAAAAAAAGHN0dHMAAAAAAAAAAQAAAEsA\n", + "AAQAAAAAFHN0c3MAAAAAAAAAAQAAAAEAAAJgY3R0cwAAAAAAAABKAAAAAQAACAAAAAABAAAUAAAA\n", + "AAEAAAgAAAAAAQAAAAAAAAABAAAEAAAAAAEAABAAAAAAAgAABAAAAAABAAAMAAAAAAEAAAQAAAAA\n", "AQAAFAAAAAABAAAIAAAAAAEAAAAAAAAAAQAABAAAAAABAAAUAAAAAAEAAAgAAAAAAQAAAAAAAAAB\n", "AAAEAAAAAAEAABQAAAAAAQAACAAAAAABAAAAAAAAAAEAAAQAAAAAAQAAFAAAAAABAAAIAAAAAAEA\n", - "AAAAAAAAAQAABAAAAAABAAAUAAAAAAEAAAgAAAAAAQAAAAAAAAABAAAEAAAAAAEAABQAAAAAAQAA\n", - "CAAAAAABAAAAAAAAAAEAAAQAAAAAAQAAFAAAAAABAAAIAAAAAAEAAAAAAAAAAQAABAAAAAABAAAM\n", + "AAAAAAAAAQAABAAAAAABAAAUAAAAAAEAAAgAAAAAAQAAAAAAAAABAAAEAAAAAAEAAAwAAAAAAQAA\n", + "BAAAAAABAAAUAAAAAAEAAAgAAAAAAQAAAAAAAAABAAAEAAAAAAEAABQAAAAAAQAACAAAAAABAAAA\n", "AAAAAAEAAAQAAAAAAQAAFAAAAAABAAAIAAAAAAEAAAAAAAAAAQAABAAAAAABAAAUAAAAAAEAAAgA\n", "AAAAAQAAAAAAAAABAAAEAAAAAAEAABQAAAAAAQAACAAAAAABAAAAAAAAAAEAAAQAAAAAAQAAFAAA\n", "AAABAAAIAAAAAAEAAAAAAAAAAQAABAAAAAABAAAUAAAAAAEAAAgAAAAAAQAAAAAAAAABAAAEAAAA\n", - "AAEAABQAAAAAAQAACAAAAAABAAAAAAAAAAEAAAQAAAAAAQAAFAAAAAABAAAIAAAAAAEAAAAAAAAA\n", - "AQAABAAAAAABAAAMAAAAAAEAAAQAAAAAAQAAFAAAAAABAAAIAAAAAAEAAAAAAAAAAQAABAAAAAAB\n", - "AAAUAAAAAAEAAAgAAAAAAQAAAAAAAAABAAAEAAAAAAEAAAgAAAAAHHN0c2MAAAAAAAAAAQAAAAEA\n", - "AABkAAAAAQAAAaRzdHN6AAAAAAAAAAAAAABkAAAGhgAAAl8AAAFjAAAAvgAAAXYAAAHzAAABDgAA\n", - "ATYAAAFIAAAB9QAAAOIAAAD6AAABWgAAAbAAAADTAAAB8wAAAN4AAAH+AAABEAAAAOIAAAG2AAAC\n", - "DAAAAWUAAAGkAAABmgAAAckAAAEdAAABfQAAAPMAAAFxAAABIgAAAjYAAAEmAAAA5AAAAXoAAAH+\n", - "AAAA/wAAAT0AAAFnAAACAwAAARQAAAE3AAABTwAAAckAAADrAAACFwAAAP0AAAHzAAABIQAAAOAA\n", - "AAHKAAACOwAAAVQAAAHFAAABugAAAdQAAAD3AAABUgAAARIAAAFuAAABLwAAAhAAAAERAAAA9gAA\n", - "AZkAAAIqAAABIgAAAV0AAAGIAAACSgAAASgAAAFEAAABggAAAegAAAD+AAACCgAAASIAAAIdAAAB\n", - "KAAAAQcAAAHbAAACFgAAAT0AAAITAAAB2gAAAi8AAAEGAAABrQAAASoAAAF0AAABZgAAAl4AAAFU\n", - "AAAA+gAAAbYAAAHjAAABLwAAAZwAAAHBAAAB8QAAABRzdGNvAAAAAAAAAAEAAAAsAAAAYnVkdGEA\n", - "AABabWV0YQAAAAAAAAAhaGRscgAAAAAAAAAAbWRpcmFwcGwAAAAAAAAAAAAAAAAtaWxzdAAAACWp\n", - "dG9vAAAAHWRhdGEAAAABAAAAAExhdmY1Ny44My4xMDA=\n", + "AAEAAAwAAAAAAQAABAAAAAABAAAUAAAAAAEAAAgAAAAAAQAAAAAAAAABAAAEAAAAAAEAABQAAAAA\n", + "AQAACAAAAAABAAAAAAAAAAEAAAQAAAAAAQAAFAAAAAABAAAIAAAAAAEAAAAAAAAAAQAABAAAAAAB\n", + "AAAIAAAAABxzdHNjAAAAAAAAAAEAAAABAAAASwAAAAEAAAFAc3RzegAAAAAAAAAAAAAASwAABs8A\n", + 
"AAI/AAABMQAAAGEAAAD8AAABkwAAAMcAAAEbAAABNgAAALkAAAGdAAAA/QAAAMYAAADdAAABzAAA\n", + "AQEAAADcAAAARQAAAdsAAAE9AAAA0QAAAU4AAAIZAAABBwAAAV4AAAEDAAABXgAAAPMAAAD0AAAB\n", + "CQAAAaUAAACyAAABnQAAANsAAAB3AAAA8QAAAbQAAAFGAAAA+AAAAQ0AAAG7AAABKQAAAOMAAADp\n", + "AAABvwAAAPoAAADKAAAAUwAAAboAAAFQAAAA+wAAAS0AAAHqAAABDAAAAUYAAAELAAABdAAAAPAA\n", + "AAEGAAABCQAAAZ8AAAD1AAACAgAAAP4AAACCAAABBAAAAfgAAAE9AAAA7gAAASEAAAGaAAABPwAA\n", + "AOMAAADjAAABIgAAABRzdGNvAAAAAAAAAAEAAAAsAAAAYnVkdGEAAABabWV0YQAAAAAAAAAhaGRs\n", + "cgAAAAAAAAAAbWRpcmFwcGwAAAAAAAAAAAAAAAAtaWxzdAAAACWpdG9vAAAAHWRhdGEAAAABAAAA\n", + "AExhdmY1Ny44My4xMDA=\n", "\"\u003e\n", " Your browser does not support the video tag.\n", "\u003c/video\u003e" ], "text/plain": [ - "\u003cIPython.core.display.HTML at 0x7f84b2253b50\u003e" + "\u003cIPython.core.display.HTML at 0x7f1286b190b8\u003e" ] }, "metadata": { @@ -1209,15 +790,15 @@ "source": [ "import time\n", "import traceback\n", + "import sys\n", "\n", "from matplotlib import pyplot as plt\n", "from matplotlib import animation as anim\n", - "import tensorflow as tf\n", - "from tensorflow.contrib import autograph as ag\n", + "import numpy as np\n", "from IPython import display\n", "\n", "\n", - "@ag.do_not_convert(ag.RunMode.PY_FUNC)\n", + "@tf.autograph.experimental.do_not_convert\n", "def render(boards):\n", " fig = plt.figure()\n", "\n", @@ -1237,74 +818,71 @@ " except RuntimeError:\n", " print('Coult not render animation:')\n", " traceback.print_exc()\n", + " return 1\n", + " return 0\n", "\n", "\n", "def gol_episode(board):\n", - " directions = tf.constant(\n", - " ((-1, -1), (-1, 0), (-1, 1), (0, -1), (0, 1), (1, -1), (1, 0), (1, 1)))\n", + " new_board = tf.TensorArray(tf.int32, 0, dynamic_size=True)\n", "\n", - " new_board = []\n", - " ag.set_element_type(new_board, tf.int32)\n", - "\n", - " for i in range(len(board)):\n", - " for j in range(len(board[i])):\n", - " num_neighbors = 0\n", - " for d in directions:\n", - " ni = i + d[0]\n", - " nj = j + d[1]\n", - " if ni \u003e= 0 and nj \u003e= 0 and ni \u003c len(board) and nj \u003c len(board[i]):\n", - " num_neighbors += board[ni][nj]\n", + " for i in tf.range(len(board)):\n", + " for j in tf.range(len(board[i])):\n", + " num_neighbors = tf.reduce_sum(\n", + " board[tf.maximum(i-1, 0):tf.minimum(i+2, len(board)),\n", + " tf.maximum(j-1, 0):tf.minimum(j+2, len(board[i]))]\n", + " ) - board[i][j]\n", " \n", - " new_cell = 0\n", " if num_neighbors == 2:\n", " new_cell = board[i][j]\n", " elif num_neighbors == 3:\n", " new_cell = 1\n", + " else:\n", + " new_cell = 0\n", " \n", " new_board.append(new_cell)\n", - " final_board = ag.stack(new_board)\n", + " final_board = new_board.stack()\n", " final_board = tf.reshape(final_board, board.shape)\n", " return final_board\n", " \n", "\n", + "@tf.function(experimental_autograph_options=(\n", + " tf.autograph.experimental.Feature.EQUALITY_OPERATORS,\n", + " tf.autograph.experimental.Feature.BUILTIN_FUNCTIONS,\n", + " tf.autograph.experimental.Feature.LISTS,\n", + " ))\n", "def gol(initial_board):\n", " board = initial_board\n", - " boards = []\n", - " ag.set_element_type(boards, tf.int32)\n", - " # We are being explicit about tensor constants to ensure the loop\n", - " # is not unrolled in the graph. 
This may change in the future.\n", - " for i in range(tf.constant(NUM_STEPS)):\n", + " boards = tf.TensorArray(tf.int32, size=0, dynamic_size=True)\n", + "\n", + " i = 0\n", + " for i in tf.range(NUM_STEPS):\n", " board = gol_episode(board)\n", " boards.append(board)\n", - " boards = ag.stack(boards)\n", - " render(boards)\n", - " return tf.no_op()\n", + " boards = boards.stack()\n", + " tf.py_function(render, (boards,), (tf.int64,))\n", + " return i\n", " \n", "\n", - "with tf.Graph().as_default():\n", - " # Gosper glider gun\n", - " # Adapted from http://www.cplusplus.com/forum/lounge/75168/\n", - " _ = 0\n", - " initial_board = tf.constant((\n", - " ( _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", - " ( _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,1,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", - " ( _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,1,_,1,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", - " ( _,_,_,_,_,_,_,_,_,_,_,_,_,1,1,_,_,_,_,_,_,1,1,_,_,_,_,_,_,_,_,_,_,_,_,1,1,_ ),\n", - " ( _,_,_,_,_,_,_,_,_,_,_,_,1,_,_,_,1,_,_,_,_,1,1,_,_,_,_,_,_,_,_,_,_,_,_,1,1,_ ),\n", - " ( _,1,1,_,_,_,_,_,_,_,_,1,_,_,_,_,_,1,_,_,_,1,1,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", - " ( _,1,1,_,_,_,_,_,_,_,_,1,_,_,_,1,_,1,1,_,_,_,_,1,_,1,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", - " ( _,_,_,_,_,_,_,_,_,_,_,1,_,_,_,_,_,1,_,_,_,_,_,_,_,1,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", - " ( _,_,_,_,_,_,_,_,_,_,_,_,1,_,_,_,1,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", - " ( _,_,_,_,_,_,_,_,_,_,_,_,_,1,1,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", - " ( _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", - " ( _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", - " ))\n", - " initial_board = tf.pad(initial_board, ((0, 20), (0, 10)))\n", - " \n", - " tf_gol = ag.to_graph(gol)\n", - " game_ops = tf_gol(initial_board)\n", - " with tf.Session() as sess:\n", - " sess.run(game_ops)\n" + "# Gosper glider gun\n", + "# Adapted from http://www.cplusplus.com/forum/lounge/75168/\n", + "_ = 0\n", + "initial_board = tf.constant((\n", + " ( _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", + " ( _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,1,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", + " ( _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,1,_,1,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", + " ( _,_,_,_,_,_,_,_,_,_,_,_,_,1,1,_,_,_,_,_,_,1,1,_,_,_,_,_,_,_,_,_,_,_,_,1,1,_ ),\n", + " ( _,_,_,_,_,_,_,_,_,_,_,_,1,_,_,_,1,_,_,_,_,1,1,_,_,_,_,_,_,_,_,_,_,_,_,1,1,_ ),\n", + " ( _,1,1,_,_,_,_,_,_,_,_,1,_,_,_,_,_,1,_,_,_,1,1,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", + " ( _,1,1,_,_,_,_,_,_,_,_,1,_,_,_,1,_,1,1,_,_,_,_,1,_,1,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", + " ( _,_,_,_,_,_,_,_,_,_,_,1,_,_,_,_,_,1,_,_,_,_,_,_,_,1,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", + " ( _,_,_,_,_,_,_,_,_,_,_,_,1,_,_,_,1,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", + " ( _,_,_,_,_,_,_,_,_,_,_,_,_,1,1,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", + " ( _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", + " ( _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_ ),\n", + "))\n", + "initial_board = tf.pad(initial_board, ((0, 10), (0, 5)))\n", + "\n", + "_ = gol(initial_board)" ] }, { @@ -1319,179 +897,21 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 0, "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - }, - "height": 2323 - }, + "colab": {}, "colab_type": "code", - "executionInfo": { - "elapsed": 
753, - "status": "ok", - "timestamp": 1532101593840, - "user": { - "displayName": "", - "photoUrl": "", - "userId": "" - }, - "user_tz": 240 - }, - "id": "hIGYeX0Cxs3i", - "outputId": "e0b62eb1-3e12-4e53-dc54-8a3fa56d823d" + "id": "hIGYeX0Cxs3i" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "from __future__ import print_function\n", - "import tensorflow as tf\n", - "\n", - "def tf__gol_episode(board):\n", - " try:\n", - " with tf.name_scope('gol_episode'):\n", - " directions = tf.constant(((-1, -1), (-1, 0), (-1, 1), (0, -1), (0, 1),\n", - " (1, -1), (1, 0), (1, 1)))\n", - " new_board = ag__.new_list([])\n", - "\n", - " def extra_test_2(new_board_2):\n", - " with tf.name_scope('extra_test_2'):\n", - " return True\n", - "\n", - " def loop_body_2(i, new_board_2):\n", - " with tf.name_scope('loop_body_2'):\n", - "\n", - " def extra_test_1(new_board_1):\n", - " with tf.name_scope('extra_test_1'):\n", - " return True\n", - "\n", - " def loop_body_1(j, new_board_1):\n", - " with tf.name_scope('loop_body_1'):\n", - " num_neighbors = 0\n", - "\n", - " def extra_test(num_neighbors_2):\n", - " with tf.name_scope('extra_test'):\n", - " return True\n", - "\n", - " def loop_body(d, num_neighbors_2):\n", - " with tf.name_scope('loop_body'):\n", - " ni = i + ag__.get_item(d, (0), opts=ag__.GetItemOpts(\n", - " element_dtype=None))\n", - " nj = j + ag__.get_item(d, (1), opts=ag__.GetItemOpts(\n", - " element_dtype=None))\n", - "\n", - " def if_true():\n", - " with tf.name_scope('if_true'):\n", - " num_neighbors_1, = num_neighbors_2,\n", - " num_neighbors_1 += ag__.get_item(ag__.get_item(board,\n", - " (ni), opts=ag__.GetItemOpts(element_dtype=None)),\n", - " (nj), opts=ag__.GetItemOpts(element_dtype=None))\n", - " return num_neighbors_1,\n", - "\n", - " def if_false():\n", - " with tf.name_scope('if_false'):\n", - " return num_neighbors_2,\n", - " num_neighbors_2 = ag__.utils.run_cond(tf.logical_and(tf.\n", - " greater_equal(ni, 0), tf.logical_and(tf.greater_equal\n", - " (nj, 0), tf.logical_and(tf.less(ni, ag__.utils.\n", - " dynamic_builtin(len, board)), tf.less(nj, ag__.utils.\n", - " dynamic_builtin(len, ag__.get_item(board, (i), opts=\n", - " ag__.GetItemOpts(element_dtype=None))))))), if_true,\n", - " if_false)\n", - " return num_neighbors_2,\n", - " num_neighbors = ag__.for_stmt(directions, extra_test,\n", - " loop_body, (num_neighbors,))\n", - " new_cell = 0\n", - "\n", - " def if_true_2():\n", - " with tf.name_scope('if_true_2'):\n", - " new_cell_2, = new_cell,\n", - " new_cell_2 = ag__.get_item(ag__.get_item(board, (i), opts\n", - " =ag__.GetItemOpts(element_dtype=None)), (j), opts=\n", - " ag__.GetItemOpts(element_dtype=None))\n", - " return new_cell_2,\n", - "\n", - " def if_false_2():\n", - " with tf.name_scope('if_false_2'):\n", - " new_cell_3, = new_cell,\n", - "\n", - " def if_true_1():\n", - " with tf.name_scope('if_true_1'):\n", - " new_cell_1, = new_cell_3,\n", - " new_cell_1 = 1\n", - " return new_cell_1,\n", - "\n", - " def if_false_1():\n", - " with tf.name_scope('if_false_1'):\n", - " return new_cell_3,\n", - " new_cell_3 = ag__.utils.run_cond(tf.equal(num_neighbors, \n", - " 3), if_true_1, if_false_1)\n", - " return new_cell_3,\n", - " new_cell = ag__.utils.run_cond(tf.equal(num_neighbors, 2),\n", - " if_true_2, if_false_2)\n", - " new_board_1 = ag__.list_append(new_board_1, new_cell)\n", - " return new_board_1,\n", - " new_board_2 = ag__.for_stmt(ag__.utils.dynamic_builtin(range,\n", - " ag__.utils.dynamic_builtin(len, ag__.get_item(board, 
(i),\n", - " opts=ag__.GetItemOpts(element_dtype=None)))), extra_test_1,\n", - " loop_body_1, (new_board_2,))\n", - " return new_board_2,\n", - " new_board = ag__.for_stmt(ag__.utils.dynamic_builtin(range, ag__.\n", - " utils.dynamic_builtin(len, board)), extra_test_2, loop_body_2, (\n", - " new_board,))\n", - " final_board = ag__.list_stack(new_board, opts=ag__.ListStackOpts(\n", - " element_dtype=tf.int32, original_call=ag.stack))\n", - " final_board = tf.reshape(final_board, board.shape)\n", - " return final_board\n", - " except:\n", - " ag__.rewrite_graph_construction_error(ag_source_map__)\n", - "\n", - "def tf__gol(initial_board):\n", - " try:\n", - " with tf.name_scope('gol'):\n", - " board = initial_board\n", - " boards = ag__.new_list([])\n", - "\n", - " def extra_test(board_1, boards_1):\n", - " with tf.name_scope('extra_test'):\n", - " return True\n", - "\n", - " def loop_body(i, board_1, boards_1):\n", - " with tf.name_scope('loop_body'):\n", - " board_1 = tf__gol_episode(board_1)\n", - " boards_1 = ag__.list_append(boards_1, board_1)\n", - " return board_1, boards_1\n", - " board, boards = ag__.for_stmt(ag__.utils.dynamic_builtin(range, tf.\n", - " constant(NUM_STEPS)), extra_test, loop_body, (board, boards))\n", - " boards = ag__.list_stack(boards, opts=ag__.ListStackOpts(\n", - " element_dtype=tf.int32, original_call=ag.stack))\n", - " with ag__.utils.control_dependency_on_returns(render(boards)):\n", - " boards_2 = ag__.utils.alias_tensors(boards)\n", - " return tf.no_op()\n", - " except:\n", - " ag__.rewrite_graph_construction_error(ag_source_map__)\n", - "\n" - ] - } - ], + "outputs": [], "source": [ - "print(ag.to_code(gol))" + "print(tf.autograph.to_code(gol.python_function))" ] } ], "metadata": { "colab": { - "collapsed_sections": [ - "p8zZyj-tq4K3", - "Lkq3DBGOv3fA", - "r8_0ioEuAI-a", - "7NgrSPCZxs3h" - ], - "default_view": {}, + "collapsed_sections": [], "last_runtime": { "build_target": "", "kind": "local" @@ -1503,8 +923,11 @@ "timestamp": 1528465909719 } ], - "version": "0.3.2", - "views": {} + "version": "0.3.2" + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" } }, "nbformat": 4, From a9429e942a261948f146f9b4a9fbaeab8598dadc Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Mon, 22 Jul 2019 14:51:30 -0700 Subject: [PATCH 0343/3053] Fix resize_bilinear type propagation This operator supports more than just float32 outputs. PiperOrigin-RevId: 259411764 --- .../toco/graph_transformations/propagate_array_data_types.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc b/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc index 0f67edce9b1..360ab3cbd5c 100644 --- a/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc +++ b/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc @@ -55,7 +55,6 @@ void SetDataTypeForAllOutputs(Model* model, Operator* op, // Do the actual output data types propagation. 
switch (op->type) { case OperatorType::kDequantize: - case OperatorType::kResizeBilinear: // These operators unconditionally produce float outputs SetDataTypeForAllOutputs(model, op, ArrayDataType::kFloat); break; From 6b0dba99c3e7f89a6c0a6770a9eefd8bf907ef46 Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Mon, 22 Jul 2019 14:54:46 -0700 Subject: [PATCH 0344/3053] [XLA GPU] Simplify tiling prologue/epilogue calling logic by initializing them to no-op lambdas PiperOrigin-RevId: 259412438 --- .../xla/service/gpu/ir_emitter_unnested.cc | 15 ++------------- .../xla/service/gpu/ir_emitter_unnested.h | 6 ++++-- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index de7fab3304e..51c34371b00 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -3191,20 +3191,9 @@ LaunchDimensions IrEmitterUnnested::EmitKernel( } }; - const BlockPrologueGenerator& block_prologue_generator = - kernel_generator.GetBlockPrologueGenerator(); - if (block_prologue_generator) { - block_prologue_generator(unnested_hlo, kernel_info); - } - + kernel_generator.GetBlockPrologueGenerator()(unnested_hlo, kernel_info); EmitBlock(std::move(emit_one_tile), kernel_info, &ksl, index_ty); - - const BlockEpilogueGenerator& block_epilogue_generator = - kernel_generator.GetBlockEpilogueGenerator(); - if (block_epilogue_generator) { - block_epilogue_generator(unnested_hlo, kernel_info); - } - + kernel_generator.GetBlockEpilogueGenerator()(unnested_hlo, kernel_info); return launch_dimensions; } diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h index e5177c28484..0e3700fc59c 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h @@ -121,8 +121,10 @@ class IrEmitterUnnested : public IrEmitter { public: explicit KernelCodeGenerator( TileElementGenerator tile_element_generator, - BlockPrologueGenerator block_prologue_generator = {}, - BlockEpilogueGenerator block_epilogue_generator = {}) + BlockPrologueGenerator block_prologue_generator = + [](HloInstruction*, KernelCodegenInfo*) {}, + BlockEpilogueGenerator block_epilogue_generator = + [](HloInstruction*, KernelCodegenInfo*) {}) : tile_element_generator_(std::move(tile_element_generator)), block_prologue_generator_(std::move(block_prologue_generator)), block_epilogue_generator_(std::move(block_epilogue_generator)) {} From aca02856dd5ecd8c177bf16993ce2d368ae56d06 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 22 Jul 2019 15:08:27 -0700 Subject: [PATCH 0345/3053] Add missing TfLiteFloat16 specialization to GetTensorData. PiperOrigin-RevId: 259415502 --- tensorflow/lite/kernels/internal/tensor_ctypes.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tensorflow/lite/kernels/internal/tensor_ctypes.h b/tensorflow/lite/kernels/internal/tensor_ctypes.h index 8ee95d4d5b3..e2136dc1549 100644 --- a/tensorflow/lite/kernels/internal/tensor_ctypes.h +++ b/tensorflow/lite/kernels/internal/tensor_ctypes.h @@ -28,6 +28,11 @@ inline float* GetTensorData(TfLiteTensor* tensor) { return tensor != nullptr ? tensor->data.f : nullptr; } +template <> +inline TfLiteFloat16* GetTensorData(TfLiteTensor* tensor) { + return tensor != nullptr ? 
tensor->data.f16 : nullptr; +} + template <> inline uint8_t* GetTensorData(TfLiteTensor* tensor) { return tensor != nullptr ? tensor->data.uint8 : nullptr; From e271346b5029a20e067bf7a2bd95dc4fd22faef7 Mon Sep 17 00:00:00 2001 From: Sergei Lebedev Date: Mon, 22 Jul 2019 15:13:32 -0700 Subject: [PATCH 0346/3053] Updated scalar caching benchmarks benchmark_add_*_scalars and is forced on GPU if a GPU is available, the result is copied back to CPU; benchmark_create_int32_scalar is only executed on CPU PiperOrigin-RevId: 259416450 --- tensorflow/python/eager/benchmarks_test.py | 25 ++++++++++++---------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py index 9e945ff3dd4..a64c3368f38 100644 --- a/tensorflow/python/eager/benchmarks_test.py +++ b/tensorflow/python/eager/benchmarks_test.py @@ -193,32 +193,35 @@ class MicroBenchmarks(test.Benchmark): def _benchmark_create_constant(self, value, dtype): def func(): - return constant_op.constant(value, dtype=dtype) + constant_op.constant(value, dtype=dtype) - for _ in range(1000): - func() # Warmup. - - self._run(func, 30000) + with ops.device("GPU:0" if context.num_gpus() else "CPU:0"): + for _ in range(1000): + func() # Warmup. + self._run(func, 3000) def benchmark_create_float_constant(self): self._benchmark_create_constant(42.0, dtype=None) def benchmark_create_int32_constant(self): + if context.num_gpus(): + return # int32 constants are always allocated on CPU. + self._benchmark_create_constant(42, dtype=dtypes.int32) def _benchmark_add_scalars(self, a, b): def func(): - return math_ops.add(a, b) + return memoryview(math_ops.add(a, b)) - for _ in range(1000): - func() # Warmup. - - self._run(func, 30000) + with ops.device("GPU:0" if context.num_gpus() else "CPU:0"): + for _ in range(1000): + func() # Warmup. + self._run(func, 30000) def benchmark_add_float_scalars(self): self._benchmark_add_scalars(42.0, 24.0) - def benchmark_add_int_scalars(self): + def benchmark_add_int32_scalars(self): self._benchmark_add_scalars(42, 24) def benchmark_create_float_tensor_from_list_CPU(self): From f81646ab0a5fb9895311436271a9c422683ce17e Mon Sep 17 00:00:00 2001 From: Sundeep Gottipati <42554856+bananabowl@users.noreply.github.com> Date: Mon, 22 Jul 2019 15:31:37 -0700 Subject: [PATCH 0347/3053] Update 1.14 behavioral changes to mention tf.keras.optimizers.Adadelta default learning rate change --- RELEASE.md | 1 + 1 file changed, 1 insertion(+) diff --git a/RELEASE.md b/RELEASE.md index 6a4c2d6486d..cc0d3e6aaee 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -43,6 +43,7 @@ * Transitive dependencies on :pooling_ops were removed. Some users may need to add explicit dependencies on :pooling_ops if they reference the operators from that library. +* tf.keras.optimizers.Adadelta default learning rate changed from 1.0 to .001 ## Bug Fixes and Other Changes From 771d4f3b521c7f6f3974432a97c2143a65b7a8a0 Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Mon, 22 Jul 2019 15:20:38 -0700 Subject: [PATCH 0348/3053] Remove some unicode characters from gpu/elemental_ir_emitter. The C++ style guide discourages these: https://google.github.io/styleguide/cppguide.html#Non-ASCII_Characters and they don't play well with all text editors. 
PiperOrigin-RevId: 259417789 --- tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc index c0cd4addc7e..a8dae7d9c80 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc @@ -144,7 +144,7 @@ StatusOr GpuElementalIrEmitter::EmitMathCall( // Binary math functions transform are of type [T] -> T. for (PrimitiveType input_type : input_types) { if (output_type != input_type) { - return Unimplemented("Input type ≠ output type: %s ≠ %s", + return Unimplemented("Input type != output type: %s != %s", PrimitiveType_Name(input_type), PrimitiveType_Name(output_type)); } @@ -408,7 +408,7 @@ llvm_ir::ElementGenerator GpuElementalIrEmitter::MakeElementGenerator( SDiv(input_multi_index[i], index_typed_const(window.dimensions(i).base_dilation())); - // We must check whether 0 ≤ input_multi_index[i] < bound, as + // We must check whether 0 <= input_multi_index[i] < bound, as // otherwise we are in the pad and so can skip the computation. This // comparison is equivalent to the unsigned comparison // input_multi_index[i] < bound, as a negative value wraps to a large From deeeaa05cab50f99f8ec795040eeebefcc280042 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 22 Jul 2019 15:27:51 -0700 Subject: [PATCH 0349/3053] Fix hello_world_test to verify against correct sine results. PiperOrigin-RevId: 259419047 --- .../micro/examples/hello_world/hello_world_test.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tensorflow/lite/experimental/micro/examples/hello_world/hello_world_test.cc b/tensorflow/lite/experimental/micro/examples/hello_world/hello_world_test.cc index 22281e7be2a..8e8cc39b486 100644 --- a/tensorflow/lite/experimental/micro/examples/hello_world/hello_world_test.cc +++ b/tensorflow/lite/experimental/micro/examples/hello_world/hello_world_test.cc @@ -90,24 +90,24 @@ TF_LITE_MICRO_TEST(LoadModelAndPerformInference) { // Obtain the output value from the tensor float value = output->data.f[0]; - // Check that the output value is within 0.000001 of the expected value - TF_LITE_MICRO_EXPECT_NEAR(0.0486171, value, 0.000001); + // Check that the output value is within 0.05 of the expected value + TF_LITE_MICRO_EXPECT_NEAR(0., value, 0.05); // Run inference on several more values and confirm the expected outputs input->data.f[0] = 1.; interpreter.Invoke(); value = output->data.f[0]; - TF_LITE_MICRO_EXPECT_NEAR(0.8071436, value, 0.000001); + TF_LITE_MICRO_EXPECT_NEAR(0.841, value, 0.05); input->data.f[0] = 3.; interpreter.Invoke(); value = output->data.f[0]; - TF_LITE_MICRO_EXPECT_NEAR(0.0964818, value, 0.000001); + TF_LITE_MICRO_EXPECT_NEAR(0.141, value, 0.05); input->data.f[0] = 5.; interpreter.Invoke(); value = output->data.f[0]; - TF_LITE_MICRO_EXPECT_NEAR(-0.9352637, value, 0.000001); + TF_LITE_MICRO_EXPECT_NEAR(-0.959, value, 0.05); } TF_LITE_MICRO_TESTS_END From a25e05476305861192dc48566bb84213aef1b188 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 22 Jul 2019 15:38:46 -0700 Subject: [PATCH 0350/3053] Add parameter to tpu.replicate API to enable automatic outside compilation. 
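A rough usage sketch (illustrative only, not part of this patch, and it assumes a TPU worker is already initialized): enabling soft device placement is what opts a replicated computation into automatic outside compilation, since tpu.replicate now forwards config.get_soft_device_placement() as the allow_soft_placement attribute.

    import tensorflow as tf

    # Opt in to soft device placement; tpu.replicate reads this setting via
    # config.get_soft_device_placement() and passes it along as the
    # allow_soft_placement metadata attribute added by this change.
    tf.config.set_soft_device_placement(True)

    def computation(x):
      # Ops without a TPU implementation may now be outside-compiled
      # automatically instead of failing compilation.
      return x * 2.0

    outputs = tf.compat.v1.tpu.replicate(
        computation, inputs=[[tf.constant([1.0, 2.0])]])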
PiperOrigin-RevId: 259421074 --- .../tensorflow/tests/graphdef2mlir/graph-function-defs.pbtxt | 2 +- tensorflow/core/ops/tpu_replication_ops.cc | 2 ++ tensorflow/python/tpu/tpu.py | 4 ++++ tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt | 2 +- tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt | 2 +- 5 files changed, 9 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-defs.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-defs.pbtxt index 40392a6954a..2488716e913 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-defs.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-defs.pbtxt @@ -512,7 +512,7 @@ versions { # CHECK-NEXT: %4:2 = "_tf.TPUReplicatedInput"(%3#0) {N = 1 : i64, T = "tfdtype$DT_INT32", device = "", name = "input1"} : (tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) # CHECK-NEXT: %5 = "_tf.NoOp"() {device = "", name = "cluster/pivot"} : () -> !_tf.control # CHECK-NEXT: %6 = "_tf.NoOp"(%5) {_tpu_replicate = "cluster", device = "", name = "NoOp"} : (!_tf.control) -> !_tf.control -# CHECK-NEXT: %7 = "_tf.TPUReplicateMetadata"(%5) {_tpu_replicate = "cluster", computation_shape = [], device = "", device_assignment = [], host_compute_core = [], name = "TPUReplicateMetadata", num_cores_per_replica = 1 : i64, num_replicas = 1 : i64, padding_map = [], step_marker_location = "STEP_MARK_AT_ENTRY", topology = "", use_tpu = true} : (!_tf.control) -> !_tf.control +# CHECK-NEXT: %7 = "_tf.TPUReplicateMetadata"(%5) {_tpu_replicate = "cluster", allow_soft_placement = false, computation_shape = [], device = "", device_assignment = [], host_compute_core = [], name = "TPUReplicateMetadata", num_cores_per_replica = 1 : i64, num_replicas = 1 : i64, padding_map = [], step_marker_location = "STEP_MARK_AT_ENTRY", topology = "", use_tpu = true} : (!_tf.control) -> !_tf.control # CHECK-NEXT: %8:2 = "_tf.TPUCompilationResult"(%7) {_tpu_compilation_status = "cluster", device = "", name = "TPUCompilationResult"} : (!_tf.control) -> (tensor, !_tf.control) # CHECK-NEXT: %9:2 = "_tf.Identity"(%2#0, %7) {T = "tfdtype$DT_INT32", _tpu_replicate = "cluster", device = "", name = "replicated_input_0"} : (tensor<*xi32>, !_tf.control) -> (tensor<*xi32>, !_tf.control) # CHECK-NEXT: %10:2 = "_tf.Identity"(%4#0, %7) {T = "tfdtype$DT_INT32", _tpu_replicate = "cluster", device = "", name = "replicated_input_1"} : (tensor<*xi32>, !_tf.control) -> (tensor<*xi32>, !_tf.control) diff --git a/tensorflow/core/ops/tpu_replication_ops.cc b/tensorflow/core/ops/tpu_replication_ops.cc index b7fd2a18e0e..265d989fe23 100644 --- a/tensorflow/core/ops/tpu_replication_ops.cc +++ b/tensorflow/core/ops/tpu_replication_ops.cc @@ -33,6 +33,7 @@ REGISTER_OP("TPUReplicateMetadata") .Attr("host_compute_core: list(string) = []") .Attr("padding_map: list(string) = []") .Attr("step_marker_location: string = \"STEP_MARK_AT_ENTRY\"") + .Attr("allow_soft_placement: bool = false") .SetShapeFn(shape_inference::UnknownShape); REGISTER_OP("TPUReplicatedInput") @@ -103,6 +104,7 @@ REGISTER_OP("_TPUReplicate") .Attr("output_types: list(type) >= 0") .Attr("padding_map: list(string) = []") .Attr("step_marker_location: string = \"STEP_MARK_AT_ENTRY\"") + .Attr("allow_soft_placement: bool = false") .Input("inputs: Tinputs") .Input("broadcast_inputs: Tbroadcast_inputs") .Input("variables: NumVariables * resource") diff --git a/tensorflow/python/tpu/tpu.py b/tensorflow/python/tpu/tpu.py index 
eeb612edbcd..c9bcf3a2e04 100644 --- a/tensorflow/python/tpu/tpu.py +++ b/tensorflow/python/tpu/tpu.py @@ -26,6 +26,7 @@ from tensorflow.core.framework import attr_value_pb2 from tensorflow.core.protobuf.tpu import dynamic_padding_pb2 as dynamic_padding from tensorflow.python.compat import compat as api_compat from tensorflow.python.compiler.xla import xla +from tensorflow.python.framework import config from tensorflow.python.framework import device as pydev from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors @@ -755,6 +756,9 @@ def split_compile_and_replicate(computation, device_assignment.num_cores_per_replica ] + # This entry is used for enabling automatic outside compilation. + metadata_kwargs["allow_soft_placement"] = config.get_soft_device_placement() + if ((not isinstance(inputs, list)) or any(not isinstance(inp, (list, tuple)) for inp in inputs)): raise TypeError("tpu.replicate() inputs must be a list of lists/tuples") diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt index fac6284ec44..abf0eae4522 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt @@ -4194,7 +4194,7 @@ tf_module { } member_method { name: "TPUReplicateMetadata" - argspec: "args=[\'num_replicas\', \'num_cores_per_replica\', \'topology\', \'use_tpu\', \'device_assignment\', \'computation_shape\', \'host_compute_core\', \'padding_map\', \'step_marker_location\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'\', \'True\', \'[]\', \'[]\', \'[]\', \'[]\', \'STEP_MARK_AT_ENTRY\', \'None\'], " + argspec: "args=[\'num_replicas\', \'num_cores_per_replica\', \'topology\', \'use_tpu\', \'device_assignment\', \'computation_shape\', \'host_compute_core\', \'padding_map\', \'step_marker_location\', \'allow_soft_placement\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'\', \'True\', \'[]\', \'[]\', \'[]\', \'[]\', \'STEP_MARK_AT_ENTRY\', \'False\', \'None\'], " } member_method { name: "TPUReplicatedInput" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt index fac6284ec44..abf0eae4522 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt @@ -4194,7 +4194,7 @@ tf_module { } member_method { name: "TPUReplicateMetadata" - argspec: "args=[\'num_replicas\', \'num_cores_per_replica\', \'topology\', \'use_tpu\', \'device_assignment\', \'computation_shape\', \'host_compute_core\', \'padding_map\', \'step_marker_location\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'\', \'True\', \'[]\', \'[]\', \'[]\', \'[]\', \'STEP_MARK_AT_ENTRY\', \'None\'], " + argspec: "args=[\'num_replicas\', \'num_cores_per_replica\', \'topology\', \'use_tpu\', \'device_assignment\', \'computation_shape\', \'host_compute_core\', \'padding_map\', \'step_marker_location\', \'allow_soft_placement\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'\', \'True\', \'[]\', \'[]\', \'[]\', \'[]\', \'STEP_MARK_AT_ENTRY\', \'False\', \'None\'], " } member_method { name: "TPUReplicatedInput" From 4478e969620d3bf9f9ed0fad1ba7fe67b5757f5b Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Mon, 22 Jul 2019 15:51:13 -0700 Subject: [PATCH 0351/3053] Respect the return_same_structure argument when maximum_iterations is supplied. 
The fix is not complete, and only addresses the common use case of single loop vars. PiperOrigin-RevId: 259423371 --- tensorflow/python/ops/control_flow_ops.py | 5 ++- .../python/ops/control_flow_ops_test.py | 40 +++++++++++++++++++ 2 files changed, 43 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py index 7d3d8d67183..4f719086123 100644 --- a/tensorflow/python/ops/control_flow_ops.py +++ b/tensorflow/python/ops/control_flow_ops.py @@ -2664,6 +2664,7 @@ def while_loop(cond, if parallel_iterations < 1: raise TypeError("parallel_iterations must be a positive integer.") + try_to_pack = (len(loop_vars) == 1 and not return_same_structure) if maximum_iterations is not None: maximum_iterations = ops.convert_to_tensor( maximum_iterations, name="maximum_iterations") @@ -2679,7 +2680,7 @@ def while_loop(cond, 0, dtype=maximum_iterations.dtype, name="iteration_counter") orig_cond = cond orig_body = body - if len(loop_vars) == 1: + if try_to_pack: loop_vars = (counter, loop_vars[0]) cond = lambda i, lv: ( # pylint: disable=g-long-lambda math_ops.logical_and(i < maximum_iterations, orig_cond(lv))) @@ -2689,9 +2690,9 @@ def while_loop(cond, cond = lambda i, lv: ( # pylint: disable=g-long-lambda math_ops.logical_and(i < maximum_iterations, orig_cond(*lv))) body = lambda i, lv: (i + 1, orig_body(*lv)) + try_to_pack = False if executing_eagerly: - try_to_pack = len(loop_vars) == 1 packed = False # whether the body result was packed into a 1-item tuple loop_var_structure = nest.map_structure(type_spec.type_spec_from_value, diff --git a/tensorflow/python/ops/control_flow_ops_test.py b/tensorflow/python/ops/control_flow_ops_test.py index 4d07d60d8ee..91ce63a287a 100644 --- a/tensorflow/python/ops/control_flow_ops_test.py +++ b/tensorflow/python/ops/control_flow_ops_test.py @@ -1300,6 +1300,26 @@ class WhileLoopTestCase(test_util.TensorFlowTestCase): r = control_flow_ops.while_loop(c, b, [i, []]) self.assertEqual(self.evaluate(r), 10) + # Adding maximum_iterations should yield the same result. + r = control_flow_ops.while_loop(c, b, [i, []], maximum_iterations=50) + # Note: this result is still incorrect - it should be just 10. + self.assertEqual(self.evaluate(r), [10, []]) + + def testWhileLoopSameReturnShape_FalseSingleLoopVar(self): + i = constant_op.constant(0) + c = lambda i: math_ops.less(i, 10) + + # Body return must be unpacked in this case. + b = lambda i: math_ops.add(i, 1) + + # Should only return the tensor. + r = control_flow_ops.while_loop(c, b, [i]) + self.assertEqual(self.evaluate(r), 10) + + # Adding maximum_iterations should yield the same result. + r = control_flow_ops.while_loop(c, b, [i], maximum_iterations=50) + self.assertEqual(self.evaluate(r), 10) + def testWhileLoopSameReturnShape_True(self): i = constant_op.constant(0) c = lambda i, _: math_ops.less(i, 10) @@ -1311,6 +1331,26 @@ class WhileLoopTestCase(test_util.TensorFlowTestCase): r = control_flow_ops.while_loop(c, b, [i, []], return_same_structure=True) self.assertEqual(self.evaluate(r), [10, []]) + # Adding maximum_iterations should yield the same result. 
+ r = control_flow_ops.while_loop( + c, b, [i, []], return_same_structure=True, maximum_iterations=50) + self.assertEqual(self.evaluate(r), [10, []]) + + def testWhileLoopSameReturnShape_TrueSingleLoopVar(self): + i = constant_op.constant(0) + c = lambda i: math_ops.less(i, 10) + + b = lambda i: [math_ops.add(i, 1)] + + # Should not unpack the single variable + r = control_flow_ops.while_loop(c, b, [i], return_same_structure=True) + self.assertEqual(self.evaluate(r), [10]) + + # Adding maximum_iterations should yield the same result. + r = control_flow_ops.while_loop( + c, b, [i], return_same_structure=True, maximum_iterations=50) + self.assertEqual(self.evaluate(r), [10]) + class AssertTest(test_util.TensorFlowTestCase): From 6fa39bee866e42854ddef471e0945befec9b0624 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 22 Jul 2019 15:58:27 -0700 Subject: [PATCH 0352/3053] Raise error when `distribute` and `run_distributed` are both passed as Keras compile arguments PiperOrigin-RevId: 259424675 --- .../python/keras_backward_compat_test.py | 104 +++++++++++++----- tensorflow/python/keras/engine/training.py | 2 +- 2 files changed, 79 insertions(+), 27 deletions(-) diff --git a/tensorflow/contrib/distribute/python/keras_backward_compat_test.py b/tensorflow/contrib/distribute/python/keras_backward_compat_test.py index c97f93371bf..d6929de07b1 100644 --- a/tensorflow/contrib/distribute/python/keras_backward_compat_test.py +++ b/tensorflow/contrib/distribute/python/keras_backward_compat_test.py @@ -369,7 +369,12 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase, optimizer = gradient_descent.GradientDescentOptimizer(0.001) loss = 'mse' metrics = ['mae'] - model.compile(optimizer, loss, metrics=metrics, distribute=distribution) + model.compile( + optimizer, + loss, + metrics=metrics, + distribute=distribution, + run_distributed=False) inputs = np.zeros((64, 3), dtype=np.float32) targets = np.zeros((64, 4), dtype=np.float32) @@ -399,7 +404,8 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase, optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.001) loss = 'mse' - model.compile(optimizer, loss, distribute=distribution) + model.compile( + optimizer, loss, distribute=distribution, run_distributed=False) input_a_np = np.asarray(np.random.random((64, 3)), dtype=np.float32) input_b_np = np.asarray(np.random.random((64, 5)), dtype=np.float32) @@ -432,7 +438,8 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase, model = get_model() optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001) loss = 'mse' - model.compile(optimizer, loss, distribute=distribution) + model.compile( + optimizer, loss, distribute=distribution, run_distributed=False) inputs = np.zeros((20, 3), np.float32) targets = np.zeros((20, 4), np.float32) @@ -448,7 +455,8 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase, optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.001) loss = 'mse' - model.compile(optimizer, loss, distribute=distribution) + model.compile( + optimizer, loss, distribute=distribution, run_distributed=False) # We take 6 input samples with each input having a dimension of 3 or 5. 
input_a_np = np.asarray(np.random.random((6, 3)), dtype=np.float32) @@ -478,7 +486,12 @@ class TestDistributionStrategyWithDatasets(test.TestCase, optimizer = gradient_descent.GradientDescentOptimizer(0.001) loss = 'mse' metrics = ['mae', keras.metrics.CategoricalAccuracy()] - model.compile(optimizer, loss, metrics=metrics, distribute=distribution) + model.compile( + optimizer, + loss, + metrics=metrics, + distribute=distribution, + run_distributed=False) dataset = get_dataset(distribution) @@ -497,7 +510,8 @@ class TestDistributionStrategyWithDatasets(test.TestCase, gradient_descent.GradientDescentOptimizer(0.001), loss='mse', metrics=['mae', keras.metrics.CategoricalAccuracy()], - distribute=distribution) + distribute=distribution, + run_distributed=False) interleaved_model = get_model() interleaved_model.set_weights(user_controlled_model.get_weights()) @@ -505,7 +519,8 @@ class TestDistributionStrategyWithDatasets(test.TestCase, gradient_descent.GradientDescentOptimizer(0.001), loss='mse', metrics=['mae', keras.metrics.CategoricalAccuracy()], - distribute=distribution) + distribute=distribution, + run_distributed=False) dataset = get_dataset(distribution) @@ -546,7 +561,12 @@ class TestDistributionStrategyWithDatasets(test.TestCase, optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.001) loss = 'mse' metrics = ['mae', keras.metrics.CategoricalAccuracy()] - model.compile(optimizer, loss, metrics=metrics, distribute=distribution) + model.compile( + optimizer, + loss, + metrics=metrics, + distribute=distribution, + run_distributed=False) input_a_np = np.random.random((10, 3)) input_b_np = np.random.random((10, 5)) @@ -578,7 +598,12 @@ class TestDistributionStrategyWithDatasets(test.TestCase, optimizer = gradient_descent.GradientDescentOptimizer(0.001) loss = 'mse' metrics = ['mae', keras.metrics.CategoricalAccuracy()] - model.compile(optimizer, loss, metrics=metrics, distribute=distribution) + model.compile( + optimizer, + loss, + metrics=metrics, + distribute=distribution, + run_distributed=False) dataset = get_dataset(distribution) @@ -592,7 +617,8 @@ class TestDistributionStrategyWithDatasets(test.TestCase, model = get_model() loss = 'mse' - model.compile(optimizer(), loss, distribute=distribution) + model.compile( + optimizer(), loss, distribute=distribution, run_distributed=False) dataset = get_dataset(distribution) @@ -605,7 +631,8 @@ class TestDistributionStrategyWithDatasets(test.TestCase, model = get_model() optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001) loss = 'mse' - model.compile(optimizer, loss, distribute=distribution) + model.compile( + optimizer, loss, distribute=distribution, run_distributed=False) inputs = np.zeros((10, 3), np.float32) targets = np.zeros((10, 4), np.float32) @@ -633,7 +660,8 @@ class TestDistributionStrategyWithDatasets(test.TestCase, optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001) loss = 'mse' - model.compile(optimizer, loss, distribute=distribution) + model.compile( + optimizer, loss, distribute=distribution, run_distributed=False) # Wrong input shape inputs = np.zeros((10, 5), dtype=np.float32) @@ -660,7 +688,8 @@ class TestDistributionStrategyWithDatasets(test.TestCase, optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001) loss = 'mse' - model.compile(optimizer, loss, distribute=distribution) + model.compile( + optimizer, loss, distribute=distribution, run_distributed=False) # User forgets to batch the dataset inputs = np.zeros((10, 3), dtype=np.float32) @@ -692,7 +721,12 @@ class 
TestDistributionStrategyWithDatasets(test.TestCase, optimizer = gradient_descent.GradientDescentOptimizer(0.005) loss = 'mse' metrics = ['acc'] - model.compile(optimizer, loss, metrics=metrics, distribute=distribution) + model.compile( + optimizer, + loss, + metrics=metrics, + distribute=distribution, + run_distributed=False) batch_size = 8 if isinstance(distribution, mirrored_strategy.CoreMirroredStrategy): @@ -727,7 +761,8 @@ class TestDistributionStrategyWithDatasets(test.TestCase, optimizer = gradient_descent_keras.SGD(0.01) loss = 'mse' - model.compile(optimizer, loss, distribute=distribution) + model.compile( + optimizer, loss, distribute=distribution, run_distributed=False) dataset = get_dataset(distribution) @@ -761,7 +796,12 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase): optimizer = gradient_descent.GradientDescentOptimizer(0.001) loss = 'mse' metrics = ['mae'] - model.compile(optimizer, loss, metrics=metrics, distribute=distribution) + model.compile( + optimizer, + loss, + metrics=metrics, + distribute=distribution, + run_distributed=False) dataset = get_dataset(distribution) @@ -816,7 +856,12 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase): optimizer = gradient_descent.GradientDescentOptimizer(0.001) loss = 'mse' metrics = ['mae'] - model.compile(optimizer, loss, metrics=metrics, distribute=distribution) + model.compile( + optimizer, + loss, + metrics=metrics, + distribute=distribution, + run_distributed=False) dataset = get_dataset(distribution) @@ -856,9 +901,11 @@ class TestDistributionStrategyWithLossMasking(test.TestCase, model.add( keras.layers.TimeDistributed( keras.layers.Dense(1, kernel_initializer='one'))) - model.compile(loss='mse', - optimizer=gradient_descent.GradientDescentOptimizer(0.01), - distribute=distribution) + model.compile( + loss='mse', + optimizer=gradient_descent.GradientDescentOptimizer(0.01), + distribute=distribution, + run_distributed=False) y = np.array([[[1], [1]], [[1], [1]]]) dataset = dataset_ops.Dataset.from_tensor_slices((x, y)) dataset = dataset.repeat(100) @@ -877,9 +924,11 @@ class TestDistributionStrategyWithNormalizationLayer( model = keras.models.Sequential() norm = keras.layers.BatchNormalization(input_shape=(10,), momentum=0.8) model.add(norm) - model.compile(loss='mse', - optimizer=gradient_descent.GradientDescentOptimizer(0.01), - distribute=distribution) + model.compile( + loss='mse', + optimizer=gradient_descent.GradientDescentOptimizer(0.01), + distribute=distribution, + run_distributed=False) # centered on 5.0, variance 10.0 x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 10)) @@ -924,7 +973,8 @@ class TestDistributionStrategyCorrectness(test.TestCase, loss=keras.losses.mean_squared_error, optimizer=gradient_descent.GradientDescentOptimizer(0.5), metrics=[keras.metrics.BinaryAccuracy()], - distribute=distribution) + distribute=distribution, + run_distributed=False) batch_size = 64 if not distributed_training_utils.global_batch_size_supported( @@ -950,7 +1000,8 @@ class TestDistributionStrategyCorrectness(test.TestCase, loss='mae', metrics=['accuracy', keras.metrics.BinaryAccuracy()], optimizer=gradient_descent.GradientDescentOptimizer(0.001), - distribute=distribution) + distribute=distribution, + run_distributed=False) # verify correctness of stateful and stateless metrics. 
x = np.ones((100, 4)).astype('float32') @@ -1026,7 +1077,8 @@ class TestDistributionStrategyCorrectness(test.TestCase, loss=keras.losses.mean_squared_error, optimizer=gradient_descent_keras.SGD(0.5), metrics=['mse'], - distribute=with_distribution) + distribute=with_distribution, + run_distributed=False) training_inputs, eval_inputs, predict_inputs = ( get_correctness_test_inputs(use_numpy, use_validation_data, diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index 718f3a582cf..4d8051cdfae 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -251,7 +251,7 @@ class Model(network.Network): self._run_distributed = False if distribute is not None: - if tf2.enabled(): + if tf2.enabled() or self._run_distributed: raise ValueError( 'Distribute argument in compile is not available in TF 2.0 please ' 'create the model under the distribution strategy scope.') From c06c33118f39ac64ae68c739364445c9224a5150 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 22 Jul 2019 16:08:26 -0700 Subject: [PATCH 0353/3053] merge the libcupti stub for xprof/oss. PiperOrigin-RevId: 259426737 --- tensorflow/stream_executor/build_defs.bzl | 3 --- tensorflow/stream_executor/cuda/BUILD | 3 +-- tensorflow/stream_executor/cuda/cupti_stub.cc | 4 ---- 3 files changed, 1 insertion(+), 9 deletions(-) diff --git a/tensorflow/stream_executor/build_defs.bzl b/tensorflow/stream_executor/build_defs.bzl index 469f5511e99..575ff639e75 100644 --- a/tensorflow/stream_executor/build_defs.bzl +++ b/tensorflow/stream_executor/build_defs.bzl @@ -13,9 +13,6 @@ def tf_additional_cuda_driver_deps(): def tf_additional_cudnn_plugin_deps(): return [] -def tf_additional_cupti_stub_data(): - return ["@local_config_cuda//cuda:cupti_dsos"] - # Returns whether any GPU backend is configuered. def if_gpu_is_configured(x): if cuda_is_configured() or rocm_is_configured(): diff --git a/tensorflow/stream_executor/cuda/BUILD b/tensorflow/stream_executor/cuda/BUILD index 2f3483b485f..eec6195561b 100644 --- a/tensorflow/stream_executor/cuda/BUILD +++ b/tensorflow/stream_executor/cuda/BUILD @@ -8,7 +8,6 @@ load( "tf_additional_cuda_driver_deps", "tf_additional_cuda_platform_deps", "tf_additional_cudnn_plugin_deps", - "tf_additional_cupti_stub_data", ) load("//tensorflow:tensorflow.bzl", "tf_copts") load( @@ -421,7 +420,7 @@ cc_library( cc_library( name = "cupti_stub", srcs = if_cuda_is_configured(["cupti_stub.cc"]), - data = if_cuda_is_configured(tf_additional_cupti_stub_data()), + data = if_cuda_is_configured(["@local_config_cuda//cuda:cupti_dsos"]), textual_hdrs = ["cupti_10_0.inc"], deps = if_cuda_is_configured([ "@local_config_cuda//cuda:cupti_headers", diff --git a/tensorflow/stream_executor/cuda/cupti_stub.cc b/tensorflow/stream_executor/cuda/cupti_stub.cc index 0c7dd2e75f0..130c3f96e44 100644 --- a/tensorflow/stream_executor/cuda/cupti_stub.cc +++ b/tensorflow/stream_executor/cuda/cupti_stub.cc @@ -23,16 +23,12 @@ limitations under the License. namespace { // Returns DSO handle or null if loading the DSO fails. 
void* GetDsoHandle() { -#ifdef PLATFORM_GOOGLE - return nullptr; -#else static auto handle = []() -> void* { auto handle_or = stream_executor::internal::DsoLoader::GetCuptiDsoHandle(); if (!handle_or.ok()) return nullptr; return handle_or.ValueOrDie(); }(); return handle; -#endif } template From 8ba08e5b2b170142b3ab6e46fcc1a4c3ba24aed9 Mon Sep 17 00:00:00 2001 From: Anjali Sridhar Date: Mon, 22 Jul 2019 16:12:58 -0700 Subject: [PATCH 0354/3053] Update tf.distribute.experimental.CentralStorageStrategy API docs. PiperOrigin-RevId: 259427495 --- .../distribute/central_storage_strategy.py | 209 +++++++++++++++++- 1 file changed, 200 insertions(+), 9 deletions(-) diff --git a/tensorflow/python/distribute/central_storage_strategy.py b/tensorflow/python/distribute/central_storage_strategy.py index caa184c5fa5..63cf21d9674 100644 --- a/tensorflow/python/distribute/central_storage_strategy.py +++ b/tensorflow/python/distribute/central_storage_strategy.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Classes implementing a multi-worker ps DistributionStrategy.""" +"""Class implementing a single machine parameter server strategy.""" from __future__ import absolute_import from __future__ import division @@ -32,12 +32,24 @@ class CentralStorageStrategy(distribute_lib.Strategy): than one GPU, compute operations (other than variable update operations) will be replicated across all GPUs. - Args: - compute_devices: an optional list of strings for device to replicate models - on. If this is not provided, all local GPUs will be used; if there is no - GPU, local CPU will be used. - parameter_device: an optional device string for which device to put - variables on. The default one is CPU or GPU if there is only one. + For Example: + ``` + strategy = tf.distribute.experimental.CentralStorageStrategy() + # Create a dataset + ds = tf.data.Dataset.range(5).batch(2) + # Distribute that dataset + dist_dataset = strategy.experimental_distribute_dataset(ds) + + with strategy.scope(): + @tf.function + def train_step(val): + return val + 1 + + # Iterate over the distributed dataset + for x in dist_dataset: + # process dataset elements + strategy.experimental_run_v2(train_step, args=(x,)) + ``` """ def __init__(self, compute_devices=None, parameter_device=None): @@ -45,22 +57,201 @@ class CentralStorageStrategy(distribute_lib.Strategy): self, compute_devices=compute_devices, parameter_device=parameter_device) + """Initializes the strategy with optional device strings. + + Args: + compute_devices: an optional list of strings for device to replicate models + on. If this is not provided, all local GPUs will be used; if there is no + GPU, local CPU will be used. + parameter_device: an optional device string for which device to put + variables on. The default one is CPU or GPU if there is only one. + """ super(CentralStorageStrategy, self).__init__(extended) @classmethod def _from_num_gpus(cls, num_gpus): return cls(device_util.local_devices_from_num_gpus(num_gpus)) + def experimental_distribute_dataset(self, dataset): # pylint: disable=useless-super-delegation + """Distributes a tf.data.Dataset instance provided via dataset. -@tf_export(v1=["distribute.experimental.CentralStorageStrategy"]) + The returned dataset is a wrapped strategy dataset which creates a + multidevice iterator under the hood. It prefetches the input data to the + specified devices on the worker. 
The returned distributed dataset can be + iterated over similar to how regular datasets can. + + NOTE: Currently, the user cannot add any more transformations to a + distributed dataset. + + For Example: + ``` + strategy = tf.distribute.CentralStorageStrategy() # with 1 CPU and 1 GPU + dataset = tf.data.Dataset.range(10).batch(2) + dist_dataset = strategy.experimental_distribute_dataset(dataset) + for x in dist_dataset: + print(x) # Prints PerReplica values [0, 1], [2, 3],... + + ``` + Args: + dataset: `tf.data.Dataset` to be prefetched to device. + + Returns: + A "distributed `Dataset`" that the caller can iterate over. + """ + return super(CentralStorageStrategy, self).experimental_distribute_dataset( + dataset) + + def experimental_distribute_datasets_from_function(self, dataset_fn): # pylint: disable=useless-super-delegation + """Distributes `tf.data.Dataset` instances created by calls to `dataset_fn`. + + `dataset_fn` will be called once for each worker in the strategy. In this + case, we only have one worker so `dataset_fn` is called once. Each replica + on this worker will then dequeue a batch of elements from this local + dataset. + + The `dataset_fn` should take an `tf.distribute.InputContext` instance where + information about batching and input replication can be accessed. + + For Example: + ``` + def dataset_fn(input_context): + batch_size = input_context.get_per_replica_batch_size(global_batch_size) + d = tf.data.Dataset.from_tensors([[1.]]).repeat().batch(batch_size) + return d.shard( + input_context.num_input_pipelines, input_context.input_pipeline_id) + + inputs = strategy.experimental_distribute_datasets_from_function(dataset_fn) + + for batch in inputs: + replica_results = strategy.experimental_run_v2(replica_fn, args=(batch,)) + ``` + + IMPORTANT: The `tf.data.Dataset` returned by `dataset_fn` should have a + per-replica batch size, unlike `experimental_distribute_dataset`, which uses + the global batch size. This may be computed using + `input_context.get_per_replica_batch_size`. + + Args: + dataset_fn: A function taking a `tf.distribute.InputContext` instance and + returning a `tf.data.Dataset`. + + Returns: + A "distributed `Dataset`", which the caller can iterate over like regular + datasets. + """ + return super( + CentralStorageStrategy, + self).experimental_distribute_datasets_from_function(dataset_fn) + + def experimental_local_results(self, value): # pylint: disable=useless-super-delegation + """Returns the list of all local per-replica values contained in `value`. + + In `CentralStorageStrategy` there is a single worker so the value returned + will be all the values on that worker. + + Args: + value: A value returned by `experimental_run()`, `experimental_run_v2()`, + `extended.call_for_each_replica()`, or a variable created in `scope`. + + Returns: + A tuple of values contained in `value`. If `value` represents a single + value, this returns `(value,).` + """ + return super(CentralStorageStrategy, self).experimental_local_results(value) + + def experimental_run_v2(self, fn, args=(), kwargs=None): # pylint: disable=useless-super-delegation + """Run `fn` on each replica, with the given arguments. + + In `CentralStorageStrategy`, `fn` is called on each of the compute + replicas, with the provided "per replica" arguments specific to that device. + + Args: + fn: The function to run. The output must be a `tf.nest` of `Tensor`s. + args: (Optional) Positional arguments to `fn`. + kwargs: (Optional) Keyword arguments to `fn`. 
+ + Returns: + Return value from running `fn`. + """ + return super(CentralStorageStrategy, self).experimental_run_v2(fn, args, + kwargs) + + def reduce(self, reduce_op, value, axis): # pylint: disable=useless-super-delegation + """Reduce `value` across replicas. + + Given a per-replica value returned by `experimental_run_v2`, say a + per-example loss, the batch will be divided across all the replicas. This + function allows you to aggregate across replicas and optionally also across + batch elements. For example, if you have a global batch size of 8 and 2 + replicas, values for examples `[0, 1, 2, 3]` will be on replica 0 and + `[4, 5, 6, 7]` will be on replica 1. By default, `reduce` will just + aggregate across replicas, returning `[0+4, 1+5, 2+6, 3+7]`. This is useful + when each replica is computing a scalar or some other value that doesn't + have a "batch" dimension (like a gradient). More often you will want to + aggregate across the global batch, which you can get by specifying the batch + dimension as the `axis`, typically `axis=0`. In this case it would return a + scalar `0+1+2+3+4+5+6+7`. + + If there is a last partial batch, you will need to specify an axis so + that the resulting shape is consistent across replicas. So if the last + batch has size 6 and it is divided into [0, 1, 2, 3] and [4, 5], you + would get a shape mismatch unless you specify `axis=0`. If you specify + `tf.distribute.ReduceOp.MEAN`, using `axis=0` will use the correct + denominator of 6. Contrast this with computing `reduce_mean` to get a + scalar value on each replica and this function to average those means, + which will weigh some values `1/8` and others `1/4`. + + For Example: + ``` + strategy = tf.distribute.experimental.CentralStorageStrategy( + compute_devices=['CPU:0', 'GPU:0'], parameter_device='CPU:0') + ds = tf.data.Dataset.range(10) + # Distribute that dataset + dist_dataset = strategy.experimental_distribute_dataset(ds) + + with strategy.scope(): + @tf.function + def train_step(val): + # pass through + return val + + # Iterate over the distributed dataset + for x in dist_dataset: + result = strategy.experimental_run_v2(train_step, args=(x,)) + + result = strategy.reduce(tf.distribute.ReduceOp.SUM, result, + axis=None).numpy() + # result: array([ 4, 6, 8, 10]) + + result = strategy.reduce(tf.distribute.ReduceOp.SUM, result, axis=0).numpy() + # result: 28 + ``` + + Args: + reduce_op: A `tf.distribute.ReduceOp` value specifying how values should + be combined. + value: A "per replica" value, e.g. returned by `experimental_run_v2` to + be combined into a single tensor. + axis: Specifies the dimension to reduce along within each + replica's tensor. Should typically be set to the batch dimension, or + `None` to only reduce across replicas (e.g. if the tensor has no batch + dimension). + + Returns: + A `Tensor`. 
+ """ + return super(CentralStorageStrategy, self).reduce(reduce_op, value, axis) + + +@tf_export(v1=["distribute.experimental.CentralStorageStrategy"]) # pylint: disable=missing-docstring class CentralStorageStrategyV1(distribute_lib.StrategyV1): __doc__ = CentralStorageStrategy.__doc__ def __init__(self, compute_devices=None, parameter_device=None): - """Initializes this strategy with default TFConfigClusterResolver.""" super(CentralStorageStrategyV1, self).__init__( parameter_server_strategy.ParameterServerStrategyExtended( self, compute_devices=compute_devices, parameter_device=parameter_device)) + __init__.__doc__ = CentralStorageStrategy.__init__.__doc__ From 920feb7b149060df34473b10c290e0807b4c5f55 Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Mon, 22 Jul 2019 16:29:00 -0700 Subject: [PATCH 0355/3053] Add simple protobuf equality tester for type to shape test. This was omitted in the initial export. PiperOrigin-RevId: 259430319 --- tensorflow/compiler/mlir/xla/BUILD | 2 +- tensorflow/compiler/mlir/xla/type_to_shape.cc | 3 +- .../compiler/mlir/xla/type_to_shape_test.cc | 30 ++++++++++++++++++- 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index fe4d7e3019d..3fce624d71a 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -175,7 +175,6 @@ cc_library( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", - "@com_google_absl//absl/base:core_headers", "@local_config_mlir//:IR", "@local_config_mlir//:Support", ], @@ -189,6 +188,7 @@ tf_cc_test( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/core:lib", "//tensorflow/core:test_main", "@local_config_mlir//:IR", ], diff --git a/tensorflow/compiler/mlir/xla/type_to_shape.cc b/tensorflow/compiler/mlir/xla/type_to_shape.cc index 40c896fef9c..e64182889cb 100644 --- a/tensorflow/compiler/mlir/xla/type_to_shape.cc +++ b/tensorflow/compiler/mlir/xla/type_to_shape.cc @@ -17,7 +17,6 @@ limitations under the License. #include -#include "absl/base/integral_types.h" #include "mlir/IR/AffineMap.h" // TF:local_config_mlir #include "mlir/IR/Diagnostics.h" // TF:local_config_mlir #include "mlir/IR/Location.h" // TF:local_config_mlir @@ -25,11 +24,13 @@ limitations under the License. #include "mlir/Support/DebugStringHelper.h" // TF:local_config_mlir #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" using mlir::IntegerType; using mlir::MemRefType; using mlir::RankedTensorType; using mlir::VectorType; +using tensorflow::int64; using xla::PrimitiveType; using xla::ShapeUtil; diff --git a/tensorflow/compiler/mlir/xla/type_to_shape_test.cc b/tensorflow/compiler/mlir/xla/type_to_shape_test.cc index 9a77be947d5..57922fe1532 100644 --- a/tensorflow/compiler/mlir/xla/type_to_shape_test.cc +++ b/tensorflow/compiler/mlir/xla/type_to_shape_test.cc @@ -15,20 +15,48 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/xla/type_to_shape.h" +#include + #include "mlir/IR/Builders.h" // TF:local_config_mlir #include "mlir/IR/MLIRContext.h" // TF:local_config_mlir #include "mlir/IR/StandardTypes.h" // TF:local_config_mlir #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/protobuf.h" using mlir::Builder; using mlir::MLIRContext; -using ::testing::EqualsProto; namespace xla { namespace { +// Simple implementation of a proto matcher comparing string representations. +// Only works as ShapeProto's textual representation is deterministic. +class ProtoStringMatcher { + public: + explicit ProtoStringMatcher(const tensorflow::protobuf::Message& expected) + : expected_(expected.SerializeAsString()) {} + + template + bool MatchAndExplain(const Message& p, testing::MatchResultListener*) const { + return p.SerializeAsString() == expected_; + } + + void DescribeTo(::std::ostream* os) const { *os << expected_; } + void DescribeNegationTo(::std::ostream* os) const { + *os << "not equal to expected message: " << expected_; + } + + private: + const std::string expected_; +}; + +inline ::testing::PolymorphicMatcher EqualsProto( + const tensorflow::protobuf::Message& x) { + return ::testing::MakePolymorphicMatcher(ProtoStringMatcher(x)); +} + TEST(TypeToShapeTest, ConvertPrimitiveTypes) { MLIRContext context; Builder b(&context); From d9bb0a0acb2811eabc15d8aaa4a61b85ebb1b3b8 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 22 Jul 2019 16:33:34 -0700 Subject: [PATCH 0356/3053] Metal: max unpooling operation test added PiperOrigin-RevId: 259431161 --- .../lite/delegates/gpu/metal/kernels/BUILD | 22 +++++ .../gpu/metal/kernels/max_unpooling_test.mm | 81 +++++++++++++++++++ 2 files changed, 103 insertions(+) create mode 100644 tensorflow/lite/delegates/gpu/metal/kernels/max_unpooling_test.mm diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/BUILD b/tensorflow/lite/delegates/gpu/metal/kernels/BUILD index 4df787c80dc..ffa4b7fa1d6 100644 --- a/tensorflow/lite/delegates/gpu/metal/kernels/BUILD +++ b/tensorflow/lite/delegates/gpu/metal/kernels/BUILD @@ -219,6 +219,28 @@ cc_library( ], ) +objc_library( + name = "max_unpooling_test_lib", + testonly = 1, + srcs = ["max_unpooling_test.mm"], + sdk_frameworks = ["XCTest"], + deps = [ + ":max_unpooling", + ":test_util", + ], +) + +ios_unit_test( + name = "max_unpooling_test", + testonly = 1, + minimum_os_version = "9.0", + tags = [ + "notap", + "tflite_not_portable_android", + ], + deps = [":max_unpooling_test_lib"], +) + cc_library( name = "mul", srcs = ["mul.cc"], diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/max_unpooling_test.mm b/tensorflow/lite/delegates/gpu/metal/kernels/max_unpooling_test.mm new file mode 100644 index 00000000000..a7231295183 --- /dev/null +++ b/tensorflow/lite/delegates/gpu/metal/kernels/max_unpooling_test.mm @@ -0,0 +1,81 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/delegates/gpu/metal/kernels/add.h" + +#import + +#include + +#include "tensorflow/lite/delegates/gpu/common/operations.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" +#include "tensorflow/lite/delegates/gpu/common/status.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" +#include "tensorflow/lite/delegates/gpu/common/util.h" +#include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h" +#include "tensorflow/lite/delegates/gpu/metal/kernels/test_util.h" +#include "tensorflow/lite/delegates/gpu/metal/runtime_options.h" + +using ::tflite::gpu::MaxUnpooling2DAttributes; +using ::tflite::gpu::BHWC; +using ::tflite::gpu::DataType; +using ::tflite::gpu::HW; +using ::tflite::gpu::metal::CompareVectors; +using ::tflite::gpu::metal::SingleOpModel; +using ::tflite::gpu::TensorRef; +using ::tflite::gpu::OperationType; + +@interface MaxUnpoolingTest : XCTestCase +@end + +@implementation MaxUnpoolingTest +- (void)setUp { + [super setUp]; +} + +- (void)testKernel2x2Stride2x2 { + TensorRef input; + input.type = DataType::FLOAT32; + input.ref = 0; + input.shape = BHWC(1, 2, 2, 1); + + TensorRef indices; + indices.type = DataType::INT32; + indices.ref = 1; + indices.shape = BHWC(1, 2, 2, 1); + + TensorRef output; + output.type = DataType::FLOAT32; + output.ref = 2; + output.shape = BHWC(1, 4, 4, 1); + + MaxUnpooling2DAttributes attr; + attr.kernel = HW(2, 2); + attr.padding.prepended = HW(0, 0); + attr.padding.appended = HW(0, 0); + attr.strides = HW(2, 2); + + SingleOpModel model({ToString(OperationType::MAX_UNPOOLING_2D), attr}, {input, indices}, + {output}); + XCTAssertTrue(model.PopulateTensor(0, {1, 2, 3, 4})); + XCTAssertTrue(model.PopulateTensor(1, {0, 0, 0, 0})); + auto status = model.Invoke(); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); + status = + CompareVectors({1, 0, 2, 0, 0, 0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 0}, model.GetOutput(0), 1e-6f); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); +} + +@end From 99834c088169127a2d831ae81a811900196e9c1e Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Mon, 22 Jul 2019 16:51:29 -0700 Subject: [PATCH 0357/3053] Metal: Fully connected operation test added PiperOrigin-RevId: 259434462 --- .../lite/delegates/gpu/metal/kernels/BUILD | 22 +++++ .../gpu/metal/kernels/fully_connected_test.mm | 84 +++++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 tensorflow/lite/delegates/gpu/metal/kernels/fully_connected_test.mm diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/BUILD b/tensorflow/lite/delegates/gpu/metal/kernels/BUILD index ffa4b7fa1d6..03a27858ecf 100644 --- a/tensorflow/lite/delegates/gpu/metal/kernels/BUILD +++ b/tensorflow/lite/delegates/gpu/metal/kernels/BUILD @@ -192,6 +192,28 @@ cc_library( ], ) +objc_library( + name = "fully_connected_test_lib", + testonly = 1, + srcs = ["fully_connected_test.mm"], + sdk_frameworks = ["XCTest"], + deps = [ + ":fully_connected", + ":test_util", + ], +) + +ios_unit_test( + name = "fully_connected_test", + testonly = 1, + minimum_os_version = "9.0", + tags = [ + "notap", + "tflite_not_portable_android", + ], + deps = [":fully_connected_test_lib"], +) + cc_library( name = "hard_swish", srcs = ["hard_swish.cc"], diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/fully_connected_test.mm b/tensorflow/lite/delegates/gpu/metal/kernels/fully_connected_test.mm new file mode 100644 index 00000000000..8f67ef489b6 --- /dev/null +++ b/tensorflow/lite/delegates/gpu/metal/kernels/fully_connected_test.mm @@ -0,0 +1,84 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/lite/delegates/gpu/metal/kernels/add.h" + +#import + +#include + +#include "tensorflow/lite/delegates/gpu/common/operations.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" +#include "tensorflow/lite/delegates/gpu/common/status.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" +#include "tensorflow/lite/delegates/gpu/common/util.h" +#include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h" +#include "tensorflow/lite/delegates/gpu/metal/kernels/test_util.h" +#include "tensorflow/lite/delegates/gpu/metal/runtime_options.h" + +using ::tflite::gpu::FullyConnectedAttributes; +using ::tflite::gpu::BHWC; +using ::tflite::gpu::DataType; +using ::tflite::gpu::Linear; +using ::tflite::gpu::metal::CompareVectors; +using ::tflite::gpu::metal::SingleOpModel; +using ::tflite::gpu::Tensor; +using ::tflite::gpu::TensorRef; +using ::tflite::gpu::OHWI; +using ::tflite::gpu::OperationType; + +@interface FullyConnectedTest : XCTestCase +@end + +@implementation FullyConnectedTest +- (void)setUp { + [super setUp]; +} + +- (void)testMatrixByVectorMultiplication { + TensorRef input; + input.type = DataType::FLOAT32; + input.ref = 0; + input.shape = BHWC(1, 1, 1, 2); + + FullyConnectedAttributes attr; + + Tensor bias; + bias.shape.v = 4; + bias.id = 1; + bias.data = {1, 2, 3, 4}; + attr.bias = std::move(bias); + + Tensor weights; + weights.shape = OHWI(4, 1, 1, 2); + weights.id = 2; + weights.data = {1, 2, 3, 4, 5, 6, 7, 8}; + attr.weights = std::move(weights); + + TensorRef output; + output.type = DataType::FLOAT32; + output.ref = 2; + output.shape = BHWC(1, 1, 1, 4); + + SingleOpModel model({ToString(OperationType::FULLY_CONNECTED), attr}, {input}, + {output}); + XCTAssertTrue(model.PopulateTensor(0, {1, 2})); + auto status = model.Invoke(); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); + status = CompareVectors({6, 13, 20, 27}, model.GetOutput(0), 1e-6f); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); +} + +@end From a81cf97b71158e04a66ac19d8d77fcb00188af4d Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Mon, 22 Jul 2019 16:58:53 -0700 Subject: [PATCH 0358/3053] Update component owners PiperOrigin-RevId: 259435724 --- CODEOWNERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CODEOWNERS b/CODEOWNERS index 2828cf3baf8..f4984403c21 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -6,6 +6,7 @@ /tensorflow/core/nccl/ @azaks2 @chsigg /tensorflow/core/platform/windows/ @mrry /tensorflow/core/platform/s3 @yongtang +/tensorflow/python/autograph/ @mdanatg @kkimdev /tensorflow/python/debug @caisq /tensorflow/python/eager @jaingurav @alextp /tensorflow/python/tools/api/generator/ @annarev @@ -15,6 +16,7 @@ # contrib # NEED OWNER: /tensorflow/contrib/all_reduce +/tensorflow/contrib/autograph/ @mdanatg @kkimdev /tensorflow/contrib/batching/ @alextp @chrisolston /tensorflow/contrib/bayesflow/ @ebrevdo @rsepassi @jvdillon /tensorflow/contrib/boosted_trees/ @sshrdp @yk5 @nataliaponomareva From 63fec203285cb7484941187e606469dffc690607 Mon Sep 17 00:00:00 2001 From: Pavithra Vijay Date: Mon, 22 Jul 2019 17:14:51 -0700 Subject: [PATCH 0359/3053] Raise ValueError if an integer is passed to the training APIs. Currently an AttributeError will be raised when x.shape is invoked in this function. In the single execution path we raise a ValueError for this. With this fix the mismatch of error types between single execution path and otherwise will be fixed. 
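As a rough sketch of the affected call pattern (the model and data below are illustrative only, not part of this change):

import numpy as np
import tensorflow as tf

# A minimal model; any training API that expects array-like inputs applies.
model = tf.keras.Sequential([tf.keras.layers.Dense(2, input_shape=(3,))])
model.compile('sgd', loss='mse')

try:
  # An integer is not array-like; before this change the failure surfaced as
  # an AttributeError from `x.shape` inside standardize_single_array.
  model.train_on_batch(1, np.zeros((2, 2)))
except ValueError as e:
  print(e)  # Now consistently a ValueError on both execution paths.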
In future however, we will need to raise specific errors from Data Adapter instead of 'Failed to find data adapter that can handle ...' PiperOrigin-RevId: 259438650 --- tensorflow/python/keras/engine/training_test.py | 4 ++-- tensorflow/python/keras/engine/training_utils.py | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py index aeec0264b92..874de6baace 100644 --- a/tensorflow/python/keras/engine/training_test.py +++ b/tensorflow/python/keras/engine/training_test.py @@ -438,7 +438,7 @@ class TrainingTest(keras_parameterized.TestCase): with self.assertRaises(ValueError): model.train_on_batch({'input_a': input_a_np}, [output_d_np, output_e_np]) - with self.assertRaises(AttributeError): + with self.assertRaises(ValueError): model.fit( [input_a_np, input_b_np], [output_d_np, output_e_np], epochs=1, @@ -446,7 +446,7 @@ class TrainingTest(keras_parameterized.TestCase): verbose=0) with self.assertRaises(ValueError): model.train_on_batch([input_a_np], [output_d_np, output_e_np]) - with self.assertRaises(AttributeError): + with self.assertRaises(ValueError): model.train_on_batch(1, [output_d_np, output_e_np]) with self.assertRaises(ValueError): model.train_on_batch(input_a_np, [output_d_np, output_e_np]) diff --git a/tensorflow/python/keras/engine/training_utils.py b/tensorflow/python/keras/engine/training_utils.py index a652807b5ce..f4c2b2613c1 100644 --- a/tensorflow/python/keras/engine/training_utils.py +++ b/tensorflow/python/keras/engine/training_utils.py @@ -435,6 +435,10 @@ def standardize_single_array(x, expected_shape=None): if composite_tensor_utils.is_composite_or_composite_value(x): return x + if isinstance(x, int): + raise ValueError( + 'Expected an array data type but received an integer: {}'.format(x)) + if (x.shape is not None and len(x.shape) == 1 and (expected_shape is None or len(expected_shape) != 1)): if tensor_util.is_tensor(x): From a7f5e36c2ad5a998e312a356bc85039fa7c575ad Mon Sep 17 00:00:00 2001 From: Zongwei Zhou Date: Mon, 22 Jul 2019 17:15:27 -0700 Subject: [PATCH 0360/3053] Remove redundant model.trainable_weight in Keras training_eager PiperOrigin-RevId: 259438753 --- tensorflow/python/keras/engine/training_eager.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/keras/engine/training_eager.py b/tensorflow/python/keras/engine/training_eager.py index c019238f48e..2619af0adc2 100644 --- a/tensorflow/python/keras/engine/training_eager.py +++ b/tensorflow/python/keras/engine/training_eager.py @@ -243,15 +243,16 @@ def _process_single_batch(model, else: scaled_total_loss = total_loss if training: - if not model.trainable_weights: + trainable_weights = model.trainable_weights + if trainable_weights: + grads = tape.gradient(scaled_total_loss, trainable_weights) + if isinstance(model.optimizer, loss_scale_optimizer.LossScaleOptimizer): + grads = model.optimizer.get_unscaled_gradients(grads) + model.optimizer.apply_gradients(zip(grads, trainable_weights)) + else: logging.warning('The list of trainable weights is empty. 
Make sure that' ' you are not setting model.trainable to False before ' 'compiling the model.') - else: - grads = tape.gradient(scaled_total_loss, model.trainable_weights) - if isinstance(model.optimizer, loss_scale_optimizer.LossScaleOptimizer): - grads = model.optimizer.get_unscaled_gradients(grads) - model.optimizer.apply_gradients(zip(grads, model.trainable_weights)) model._set_trainable_state(current_trainable_state) return outs, total_loss, output_losses, masks From f9a29b476656db2e03f6fdc3504e4df4b5e43994 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 22 Jul 2019 17:21:17 -0700 Subject: [PATCH 0361/3053] Metal: element wise operations tests added PiperOrigin-RevId: 259439620 --- .../lite/delegates/gpu/metal/kernels/BUILD | 22 ++ .../gpu/metal/kernels/elementwise_test.mm | 199 ++++++++++++++++++ 2 files changed, 221 insertions(+) create mode 100644 tensorflow/lite/delegates/gpu/metal/kernels/elementwise_test.mm diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/BUILD b/tensorflow/lite/delegates/gpu/metal/kernels/BUILD index 03a27858ecf..f1ce542dad7 100644 --- a/tensorflow/lite/delegates/gpu/metal/kernels/BUILD +++ b/tensorflow/lite/delegates/gpu/metal/kernels/BUILD @@ -175,6 +175,28 @@ cc_library( ], ) +objc_library( + name = "elementwise_test_lib", + testonly = 1, + srcs = ["elementwise_test.mm"], + sdk_frameworks = ["XCTest"], + deps = [ + ":elementwise", + ":test_util", + ], +) + +ios_unit_test( + name = "elementwise_test", + testonly = 1, + minimum_os_version = "9.0", + tags = [ + "notap", + "tflite_not_portable_android", + ], + deps = [":elementwise_test_lib"], +) + cc_library( name = "fully_connected", srcs = ["fully_connected.cc"], diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/elementwise_test.mm b/tensorflow/lite/delegates/gpu/metal/kernels/elementwise_test.mm new file mode 100644 index 00000000000..e2e4c5b7e0f --- /dev/null +++ b/tensorflow/lite/delegates/gpu/metal/kernels/elementwise_test.mm @@ -0,0 +1,199 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/lite/delegates/gpu/metal/kernels/add.h" + +#import + +#include + +#include "tensorflow/lite/delegates/gpu/common/operations.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" +#include "tensorflow/lite/delegates/gpu/common/status.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" +#include "tensorflow/lite/delegates/gpu/common/util.h" +#include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h" +#include "tensorflow/lite/delegates/gpu/metal/kernels/test_util.h" +#include "tensorflow/lite/delegates/gpu/metal/runtime_options.h" + +using ::tflite::gpu::DataType; +using ::tflite::gpu::BHWC; +using ::tflite::gpu::metal::CompareVectors; +using ::tflite::gpu::metal::SingleOpModel; +using ::tflite::gpu::OperationType; +using ::tflite::gpu::TensorRef; + +@interface ElementwiseTest : XCTestCase +@end + +@implementation ElementwiseTest +- (void)setUp { + [super setUp]; +} + +TensorRef GetTensorRef(int ref, const BHWC& shape) { + TensorRef tensor_ref; + tensor_ref.type = DataType::FLOAT32; + tensor_ref.ref = ref; + tensor_ref.shape = shape; + return tensor_ref; +} + +- (void)testAbs { + OperationType op_type = OperationType::ABS; + const BHWC shape(1, 2, 2, 1); + SingleOpModel model({/*type=*/ToString(op_type), /*attributes=*/{}}, + /*inputs=*/{GetTensorRef(0, shape)}, + /*outputs=*/{GetTensorRef(1, shape)}); + XCTAssertTrue(model.PopulateTensor(0, {0.0, -6.2, 2.0, 4.0})); + auto status = model.Invoke(); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); + status = CompareVectors({0.0, 6.2, 2.0, 4.0}, model.GetOutput(0), 1e-6f); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); +} + +- (void)testCos { + OperationType op_type = OperationType::COS; + const BHWC shape(1, 2, 2, 1); + SingleOpModel model({/*type=*/ToString(op_type), /*attributes=*/{}}, + /*inputs=*/{GetTensorRef(0, shape)}, + /*outputs=*/{GetTensorRef(1, shape)}); + XCTAssertTrue(model.PopulateTensor(0, {0.0, 3.1415926, -3.1415926, 1})); + auto status = model.Invoke(); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); + status = CompareVectors({1.0, -1.0, -1.0, 0.540302}, model.GetOutput(0), 1e-6f); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); +} + +- (void)testHardSwish { + OperationType op_type = OperationType::HARD_SWISH; + const BHWC shape(1, 1, 1, 7); + SingleOpModel model({/*type=*/ToString(op_type), /*attributes=*/{}}, + /*inputs=*/{GetTensorRef(0, shape)}, + /*outputs=*/{GetTensorRef(1, shape)}); + XCTAssertTrue(model.PopulateTensor(0, {-4.5f, -3.0f, -1.5f, 0.0f, 1.5f, 3.0f, 4.5f})); + auto status = model.Invoke(); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); + status = + CompareVectors({0.0f, 0.0f, -0.375f, 0.0f, 1.125f, 3.f, 4.5f}, model.GetOutput(0), 1e-6f); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); +} + +- (void)testLog { + OperationType op_type = OperationType::LOG; + const BHWC shape(1, 2, 2, 1); + SingleOpModel model({/*type=*/ToString(op_type), /*attributes=*/{}}, + /*inputs=*/{GetTensorRef(0, shape)}, + /*outputs=*/{GetTensorRef(1, shape)}); + XCTAssertTrue(model.PopulateTensor(0, {1.0, 3.1415926, 1.0, 1.0})); + auto status = model.Invoke(); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); + status = CompareVectors({0.0, 1.14473, 0.0, 0.0}, model.GetOutput(0), 1e-6f); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); +} + +- (void)testRsqrt { + OperationType 
op_type = OperationType::RSQRT; + const BHWC shape(1, 2, 2, 1); + SingleOpModel model({/*type=*/ToString(op_type), /*attributes=*/{}}, + /*inputs=*/{GetTensorRef(0, shape)}, + /*outputs=*/{GetTensorRef(1, shape)}); + XCTAssertTrue(model.PopulateTensor(0, {1.0, 2.0, 4.0, 9.0})); + auto status = model.Invoke(); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); + status = CompareVectors({1.0, 0.707106, 0.5, 0.333333}, model.GetOutput(0), 1e-6f); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); +} + +- (void)testSigmoid { + OperationType op_type = OperationType::SIGMOID; + const BHWC shape(1, 2, 2, 1); + SingleOpModel model({/*type=*/ToString(op_type), /*attributes=*/{}}, + /*inputs=*/{GetTensorRef(0, shape)}, + /*outputs=*/{GetTensorRef(1, shape)}); + XCTAssertTrue(model.PopulateTensor(0, {0.0, -6.0, 2.0, 4.0})); + auto status = model.Invoke(); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); + status = CompareVectors({0.5, 0.002473, 0.880797, 0.982014}, model.GetOutput(0), 1e-6f); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); +} + +- (void)testSin { + OperationType op_type = OperationType::SIN; + const BHWC shape(1, 2, 2, 1); + SingleOpModel model({/*type=*/ToString(op_type), /*attributes=*/{}}, + /*inputs=*/{GetTensorRef(0, shape)}, + /*outputs=*/{GetTensorRef(1, shape)}); + XCTAssertTrue(model.PopulateTensor(0, {0.0, 3.1415926, -3.1415926, 1.0})); + auto status = model.Invoke(); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); + status = CompareVectors({0.0, 0.0, 0.0, 0.841471}, model.GetOutput(0), 1e-6f); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); +} + +- (void)testSqrt { + OperationType op_type = OperationType::SQRT; + const BHWC shape(1, 2, 2, 1); + SingleOpModel model({/*type=*/ToString(op_type), /*attributes=*/{}}, + /*inputs=*/{GetTensorRef(0, shape)}, + /*outputs=*/{GetTensorRef(1, shape)}); + XCTAssertTrue(model.PopulateTensor(0, {0.0, 1.0, 2.0, 4.0})); + auto status = model.Invoke(); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); + status = CompareVectors({0.0, 1.0, 1.414213, 2.0}, model.GetOutput(0), 1e-6f); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); +} + +- (void)testSquare { + OperationType op_type = OperationType::SQUARE; + const BHWC shape(1, 2, 2, 1); + SingleOpModel model({/*type=*/ToString(op_type), /*attributes=*/{}}, + /*inputs=*/{GetTensorRef(0, shape)}, + /*outputs=*/{GetTensorRef(1, shape)}); + XCTAssertTrue(model.PopulateTensor(0, {1.0, 2.0, 0.5, -3.0})); + auto status = model.Invoke(); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); + status = CompareVectors({1.0, 4.0, 0.25, 9.0}, model.GetOutput(0), 1e-6f); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); +} + +- (void)testSub { + OperationType op_type = OperationType::SUB; + const BHWC shape(1, 2, 2, 1); + SingleOpModel model({/*type=*/ToString(op_type), /*attributes=*/{}}, + /*inputs=*/{GetTensorRef(0, shape), GetTensorRef(1, shape)}, + /*outputs=*/{GetTensorRef(2, shape)}); + XCTAssertTrue(model.PopulateTensor(0, {0.0, -6.2, 2.0, 4.0})); + XCTAssertTrue(model.PopulateTensor(1, {1.0, 2.0, 3.0, 4.0})); + auto status = model.Invoke(); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); + status = CompareVectors({-1.0, -8.2, -1.0, 0.0}, model.GetOutput(0), 1e-6f); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); +} + +- (void)testTanh { + OperationType op_type = OperationType::TANH; + const BHWC shape(1, 2, 2, 1); + 
SingleOpModel model({/*type=*/ToString(op_type), /*attributes=*/{}}, + /*inputs=*/{GetTensorRef(0, shape)}, + /*outputs=*/{GetTensorRef(1, shape)}); + XCTAssertTrue(model.PopulateTensor(0, {0.0, -6.0, 2.0, 4.0})); + auto status = model.Invoke(); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); + status = CompareVectors({0.0, -0.999987, 0.964027, 0.999329}, model.GetOutput(0), 1e-6f); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); +} + +@end From e8a11738526c86258bbef43c111e96221145be48 Mon Sep 17 00:00:00 2001 From: Igor Ganichev Date: Mon, 22 Jul 2019 17:24:13 -0700 Subject: [PATCH 0362/3053] Make EagerTensor reference Context The only current use of this reference is to ensure that Python deletes eager Context after deleting all tensors using it. For tensors created from Python (by calling EagerTensor constructor), this CL passes the whole Python Context object instead of just the pointer to TFE_Context. For tensors created from C++ (via EagerTensorFromHandle), this CL retrieves the Context by calling the Python's context() method. I tried passing the Context around to instead of retrieving it from Python, but it required a fair amont of extra and mostly useless plumbing. PiperOrigin-RevId: 259440004 --- tensorflow/python/eager/benchmarks_test.py | 5 +-- tensorflow/python/eager/context.py | 23 ++++++++--- tensorflow/python/eager/context_test.py | 37 ++++++++++++++++++ tensorflow/python/eager/ops_test.py | 14 +++++++ tensorflow/python/eager/pywrap_tensor.cc | 39 +++++++++++++++++-- tensorflow/python/eager/pywrap_tfe.h | 16 ++++++++ tensorflow/python/eager/pywrap_tfe_src.cc | 28 ++++++++++++++ tensorflow/python/eager/tensor_test.py | 16 ++++---- tensorflow/python/framework/constant_op.py | 7 ++-- tensorflow/python/framework/ops.py | 45 +++++++++++----------- tensorflow/python/pywrap_tfe.i | 1 + 11 files changed, 186 insertions(+), 45 deletions(-) diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py index a64c3368f38..7113144d237 100644 --- a/tensorflow/python/eager/benchmarks_test.py +++ b/tensorflow/python/eager/benchmarks_test.py @@ -181,13 +181,12 @@ class MicroBenchmarks(test.Benchmark): def _benchmark_create_tensor(self, value, dtype, device): """Benchmark overheads of creating a Tensor object.""" ctx = context.context() - handle = ctx._handle if device == GPU: # Warmup the GPU - ops.EagerTensor(value, context=handle, device=device) + ops.EagerTensor(value, context=ctx, device=device) def func(): - ops.EagerTensor(value, context=handle, device=device, dtype=dtype) + ops.EagerTensor(value, context=ctx, device=device, dtype=dtype) self._run(func, 30000) diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py index fb6d9428be8..245228d4075 100644 --- a/tensorflow/python/eager/context.py +++ b/tensorflow/python/eager/context.py @@ -1470,9 +1470,6 @@ class Context(object): def end_step(self): pywrap_tensorflow.TFE_ContextEndStep(self._handle) -_context = None -_context_lock = threading.Lock() - class _EagerDeviceContext(object): """Context-manager forcing placement of ops and Tensors on a device.""" @@ -1526,11 +1523,27 @@ class _EagerDeviceContext(object): ctx._set_device(old_device_name, old_device_spec) # pylint: disable=protected-access -def _create_context(): +# Do not set directly. Use _set_context. 
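+# Besides storing the context here, _set_context registers it with the C
+# extension via pywrap_tensorflow.TFE_Py_SetEagerContext (which keeps only a
+# weak reference); EagerTensors created from C++ retrieve it through
+# GetPyEagerContext and hold a strong reference, so the Context is destroyed
+# only after the last tensor that uses it.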
+_context = None +_context_lock = threading.Lock() + + +def _set_context_locked(ctx): global _context + pywrap_tensorflow.TFE_Py_SetEagerContext(ctx) + _context = ctx + + +def _set_context(ctx): + with _context_lock: + _set_context_locked(ctx) + + +def _create_context(): with _context_lock: if _context is None: - _context = Context() + ctx = Context() + _set_context_locked(ctx) def context(): diff --git a/tensorflow/python/eager/context_test.py b/tensorflow/python/eager/context_test.py index ba856b803fa..3b1a3c27622 100644 --- a/tensorflow/python/eager/context_test.py +++ b/tensorflow/python/eager/context_test.py @@ -17,9 +17,12 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import weakref + import numpy as np from tensorflow.python.eager import context +from tensorflow.python.framework import constant_op from tensorflow.python.framework import ops from tensorflow.python.platform import test @@ -34,6 +37,40 @@ class ContextTest(test.TestCase): c._set_global_seed(np.array(123, dtype=t)) c._set_global_seed(ops.convert_to_tensor(123, dtype=t)) + def testContextIsDestroyedAfterTensors(self): + # Create a new context + new_context = context.Context() + weak_c = weakref.ref(new_context) + new_context.ensure_initialized() + + # Create a tensor with the new context as default. + # Make sure to restore the original context. + original_context = context.context() + try: + context._set_context(new_context) + # Use a 2D tensor so that it is not cached. + tensor1 = constant_op.constant([[3.]]) + # Produce a tensor as an operation output. This uses a different code path + # from tensors created from Python. + tensor2 = tensor1 * tensor1 + context._set_context(original_context) + except: + context._set_context(original_context) + raise + + # Deleting our context reference should not delete the underlying object. + del new_context + self.assertIsNot(weak_c(), None) + + # Deleting the first tensor should not delete the context since there is + # another tensor. + del tensor1 + self.assertIsNot(weak_c(), None) + + # Deleting the last tensor should result in deleting its context. + del tensor2 + self.assertIs(weak_c(), None) + if __name__ == '__main__': ops.enable_eager_execution() diff --git a/tensorflow/python/eager/ops_test.py b/tensorflow/python/eager/ops_test.py index 678aa589c74..0a3eb2fdc46 100644 --- a/tensorflow/python/eager/ops_test.py +++ b/tensorflow/python/eager/ops_test.py @@ -17,6 +17,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import gc import threading import weakref @@ -422,6 +423,19 @@ class OpsTest(test_util.TensorFlowTestCase): del strong_y self.assertEqual([], list(weak_key_dict)) + def testEagerTensorsCanBeGarbageCollected(self): + x = constant_op.constant([[1.]]) + y = constant_op.constant([[2.]]) + x.y = y + y.x = x + weak_x = weakref.ref(x) + weak_y = weakref.ref(y) + del x + del y + gc.collect() + self.assertIs(weak_x(), None) + self.assertIs(weak_y(), None) + if __name__ == '__main__': test.main() diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc index 4dbdc2895fd..40f7be586be 100644 --- a/tensorflow/python/eager/pywrap_tensor.cc +++ b/tensorflow/python/eager/pywrap_tensor.cc @@ -46,7 +46,8 @@ TFE_Context* GetContext(PyObject* ctx) { if (context == nullptr) { PyErr_SetString(PyExc_TypeError, tensorflow::strings::StrCat( - "Expecting a PyCapsule encoded context handle. 
Got ", + "Expected context._handle to contain a PyCapsule " + "encoded pointer to TFE_Context. Got ", Py_TYPE(ctx)->tp_name) .c_str()); } @@ -369,6 +370,10 @@ typedef struct EagerTensor { // thread-safe. TF_Status* status; + // The eager Context (from eager/context.py) used by this Tensor. + // This is currently used only to make sure context outlives TensorHandles. + PyObject* context; + PyObject* weakreflist; /* List of weak references */ // Per-instance attribute dictionary, to support monkey patching @@ -426,6 +431,7 @@ int EagerTensor_init(EagerTensor* self, PyObject* args, PyObject* kwds) { self->status = TF_NewStatus(); self->dict = nullptr; self->weakreflist = nullptr; + self->context = nullptr; PyObject* value; PyObject* context = nullptr; PyObject* device = nullptr; @@ -439,6 +445,21 @@ int EagerTensor_init(EagerTensor* self, PyObject* args, PyObject* kwds) { return -1; } + tensorflow::Safe_PyObjectPtr context_handle( + PyObject_GetAttrString(context, "_handle")); + if (context_handle == nullptr) { + // Current Python code makes sure this never happens. If it does, or + // becomes hard to maintain, we can call the ensure_initialized() method + // here. + PyErr_SetString( + PyExc_TypeError, + "Expected `context` argument in EagerTensor constructor to have a " + "`_handle` field but it did not. Was eager Context initialized?"); + return -1; + } + self->context = context; + Py_INCREF(self->context); + if (other_value != nullptr) { if (!EagerTensor_CheckExact(other_value)) { PyErr_SetString(PyExc_TypeError, @@ -475,7 +496,7 @@ int EagerTensor_init(EagerTensor* self, PyObject* args, PyObject* kwds) { PyErr_Clear(); tensorflow::Safe_TFE_TensorHandlePtr handle = tensorflow::make_safe(tensorflow::ConvertToEagerTensor( - GetContext(context), value, desired_dtype)); + GetContext(context_handle.get()), value, desired_dtype)); if (handle == nullptr) return -1; // Almost all TensorFlow kernels for GPU devices keep int32 tensors in host @@ -507,7 +528,8 @@ int EagerTensor_init(EagerTensor* self, PyObject* args, PyObject* kwds) { if (TFE_TensorHandleDataType(handle.get()) != TF_INT32) { // Note that this is a shallow copy and will share the underlying buffer // if copying to the same device. - handle = tensorflow::make_safe(CopyToDevice(handle.get(), context, device)); + handle = tensorflow::make_safe( + CopyToDevice(handle.get(), context_handle.get(), device)); if (handle == nullptr) return -1; } self->handle = handle.release(); @@ -540,6 +562,10 @@ void EagerTensor_dealloc(EagerTensor* self) { TFE_DeleteTensorHandle(self->handle); self->handle = nullptr; } + + // Decref context after deleting the tensor handle. + Py_XDECREF(self->context); + // We have the global interpreter lock, so use this chance to perform delayed // refcount decrements. 
tensorflow::ClearDecrefCache(); @@ -874,6 +900,13 @@ PyObject* EagerTensorFromHandle(TFE_TensorHandle* handle) { t->handle = handle; t->status = TF_NewStatus(); t->weakreflist = nullptr; + PyObject* context = GetPyEagerContext(); + if (context == nullptr) { + LOG(ERROR) << "Cannot create an eager tensor before eager context has " + "been set or after it has been deleted"; + return nullptr; + } + t->context = context; if (!MaybeInvokeCreatedOnEagerTensorProfiler(t)) { return nullptr; diff --git a/tensorflow/python/eager/pywrap_tfe.h b/tensorflow/python/eager/pywrap_tfe.h index 57e1e2dd016..574f1115b89 100755 --- a/tensorflow/python/eager/pywrap_tfe.h +++ b/tensorflow/python/eager/pywrap_tfe.h @@ -280,4 +280,20 @@ PyObject* TFE_Py_EncodeArg(PyObject*, bool include_tensor_ranks_only); void TFE_Py_EnableInteractivePythonLogging(); +// Sets `python_context` as the current eager Context object (defined +// in eager/context.py). This function must be called at least once before +// eager tensors are created. +// If an error is encountered, sets python error and returns NULL. Else, returns +// Py_None. +// +// This function is not thread-safe. +PyObject* TFE_Py_SetEagerContext(PyObject* python_context); + +// Returns the current eager Context object (defined in eager/context.py) +// that was last set using TFE_Py_SetEagerContext. +// If an error is encountered, sets python error and returns NULL. +// The returned PyObject is "new", i.e. the caller must call Py_DECREF on it at +// some point. +PyObject* GetPyEagerContext(); + #endif // TENSORFLOW_PYTHON_EAGER_PYWRAP_TFE_H_ diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc index dfe45d17aa7..9b6ac1ab2c2 100644 --- a/tensorflow/python/eager/pywrap_tfe_src.cc +++ b/tensorflow/python/eager/pywrap_tfe_src.cc @@ -3499,3 +3499,31 @@ void TFE_Py_EnableInteractivePythonLogging() { TF_RegisterLogListener(PrintToPythonStdout); } } + +namespace { +// weak reference to Python Context object currently active +PyObject* weak_eager_context = nullptr; +} // namespace + +PyObject* TFE_Py_SetEagerContext(PyObject* python_context) { + Py_XDECREF(weak_eager_context); + weak_eager_context = PyWeakref_NewRef(python_context, nullptr); + if (weak_eager_context == nullptr) { + return nullptr; + } + Py_RETURN_NONE; +} + +PyObject* GetPyEagerContext() { + if (weak_eager_context == nullptr) { + PyErr_SetString(PyExc_ValueError, "Python eager context is not set"); + return nullptr; + } + PyObject* context = PyWeakref_GET_OBJECT(weak_eager_context); + if (context == Py_None) { + LOG(ERROR) << "Eager context has been destroyed"; + return nullptr; + } + Py_INCREF(context); + return context; +} diff --git a/tensorflow/python/eager/tensor_test.py b/tensorflow/python/eager/tensor_test.py index 0059cdf1069..c43305853b5 100644 --- a/tensorflow/python/eager/tensor_test.py +++ b/tensorflow/python/eager/tensor_test.py @@ -48,7 +48,7 @@ def _create_tensor(value, device=None, dtype=None): dtype = dtype.as_datatype_enum try: return ops.EagerTensor( - value, context=ctx._handle, device=device, dtype=dtype) + value, context=ctx, device=device, dtype=dtype) except core._NotOkStatusException as e: # pylint: disable=protected-access raise core._status_to_exception(e.code, e.message) @@ -67,7 +67,6 @@ class TFETensorTest(test_util.TensorFlowTestCase): def testBadConstructorArgs(self): context.ensure_initialized() ctx = context.context() - handle = ctx._handle device = ctx.device_name # Missing context. 
with self.assertRaisesRegexp( @@ -76,11 +75,11 @@ class TFETensorTest(test_util.TensorFlowTestCase): # Missing device. with self.assertRaisesRegexp( TypeError, r".*argument 'device' \(pos 3\).*"): - ops.EagerTensor(1, context=handle) + ops.EagerTensor(1, context=ctx) # Bad dtype type. with self.assertRaisesRegexp(TypeError, "Expecting a DataType value for dtype. Got"): - ops.EagerTensor(1, context=handle, device=device, dtype="1") + ops.EagerTensor(1, context=ctx, device=device, dtype="1") # Following errors happen when trying to copy to GPU. if not test_util.is_gpu_available(): @@ -90,12 +89,14 @@ class TFETensorTest(test_util.TensorFlowTestCase): device = ctx.device_name # Bad context. with self.assertRaisesRegexp( - TypeError, "Expecting a PyCapsule encoded context handle. Got"): + TypeError, + "Expected `context` argument in EagerTensor constructor to have a " + "`_handle` field but it did not. Was eager Context initialized?"): ops.EagerTensor(1.0, context=1, device=device) # Bad device. with self.assertRaisesRegexp( TypeError, "Error parsing device argument to CopyToDevice"): - ops.EagerTensor(1.0, context=handle, device=1) + ops.EagerTensor(1.0, context=ctx, device=1) def testNumpyValue(self): values = np.array([3.0]) @@ -122,7 +123,7 @@ class TFETensorTest(test_util.TensorFlowTestCase): # Bad dtype value. with self.assertRaisesRegexp(TypeError, "Invalid dtype argument value"): ops.EagerTensor( - values, context=ctx._handle, device=ctx.device_name, dtype=12345) + values, context=ctx, device=ctx.device_name, dtype=12345) def testNumpyOrderHandling(self): n = np.array([[1, 2], [3, 4]], order="F") @@ -537,6 +538,5 @@ class TFETensorUtilTest(test_util.TensorFlowTestCase): ValueError, "non-rectangular Python sequence"): constant_op.constant(l) - if __name__ == "__main__": test.main() diff --git a/tensorflow/python/framework/constant_op.py b/tensorflow/python/framework/constant_op.py index b092d0d3c2e..a4b2769bfc2 100644 --- a/tensorflow/python/framework/constant_op.py +++ b/tensorflow/python/framework/constant_op.py @@ -96,7 +96,6 @@ def convert_to_eager_tensor(value, ctx, dtype=None): dtype = dtypes.as_dtype(dtype).as_datatype_enum ctx.ensure_initialized() device = ctx.device_name - handle = ctx._handle # pylint: disable=protected-access if isinstance(value, (float,) + six.integer_types): # Use a scalar cache. This will put each scalar of each type only once on # each device. 
Scalars don't use much device memory but copying scalars can @@ -106,12 +105,12 @@ def convert_to_eager_tensor(value, ctx, dtype=None): tensor = scalar_cache.get(cache_key, None) if tensor is not None: return ops.EagerTensor( - value, handle, device, dtype, tensor) - t = ops.EagerTensor(value, handle, device, dtype) + value, ctx, device, dtype, tensor) + t = ops.EagerTensor(value, ctx, device, dtype) scalar_cache[cache_key] = t return t else: - return ops.EagerTensor(value, handle, device, dtype) + return ops.EagerTensor(value, ctx, device, dtype) @tf_export(v1=["constant"]) diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index e4a68e08ab0..d710e7db0cf 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -5671,28 +5671,29 @@ def enable_eager_execution_internal(config=None, "tf.enable_eager_execution must be called at program startup.") context.default_execution_mode = context.EAGER_MODE # pylint: disable=protected-access - if context._context is None: - context._context = context.Context( - config=config, - device_policy=device_policy, - execution_mode=execution_mode, - server_def=server_def) - elif ((config is not None and config is not context._context._config) or - (device_policy is not None and - device_policy is not context._context._device_policy) or - (execution_mode is not None and - execution_mode is not context._context._execution_mode)): - raise ValueError( - "Trying to change the options of an active eager" - " execution. Context config: %s, specified config:" - " %s. Context device policy: %s, specified device" - " policy: %s. Context execution mode: %s, " - " specified execution mode %s." % - (context._context._config, config, context._context._device_policy, - device_policy, context._context._execution_mode, execution_mode)) - else: - # We already created everything, so update the thread local data. - context._context._thread_local_data.is_eager = True + with context._context_lock: + if context._context is None: + context._set_context_locked(context.Context( + config=config, + device_policy=device_policy, + execution_mode=execution_mode, + server_def=server_def)) + elif ((config is not None and config is not context._context._config) or + (device_policy is not None and + device_policy is not context._context._device_policy) or + (execution_mode is not None and + execution_mode is not context._context._execution_mode)): + raise ValueError( + "Trying to change the options of an active eager" + " execution. Context config: %s, specified config:" + " %s. Context device policy: %s, specified device" + " policy: %s. Context execution mode: %s, " + " specified execution mode %s." % + (context._context._config, config, context._context._device_policy, + device_policy, context._context._execution_mode, execution_mode)) + else: + # We already created everything, so update the thread local data. + context._context._thread_local_data.is_eager = True # Monkey patch to get rid of an unnecessary conditional since the context is # now initialized. diff --git a/tensorflow/python/pywrap_tfe.i b/tensorflow/python/pywrap_tfe.i index f07f8dffd73..e9d4bdd7c6e 100755 --- a/tensorflow/python/pywrap_tfe.i +++ b/tensorflow/python/pywrap_tfe.i @@ -96,6 +96,7 @@ limitations under the License. 
%rename("%s") TFE_Py_TensorShapeSlice; %rename("%s") TFE_Py_TensorShapeOnDevice; %rename("%s") TFE_Py_EnableInteractivePythonLogging; +%rename("%s") TFE_Py_SetEagerContext; %rename("%s") TFE_ContextStartStep; %rename("%s") TFE_ContextEndStep; %rename("%s") TFE_Py_RegisterVSpace; From cad41daf444453df64f93e669c56bdb1d9fc9d8b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 22 Jul 2019 17:45:08 -0700 Subject: [PATCH 0363/3053] Metal: depthwise convolution test added PiperOrigin-RevId: 259443075 --- .../lite/delegates/gpu/metal/kernels/BUILD | 22 +++ .../gpu/metal/kernels/depthwise_conv_test.mm | 172 ++++++++++++++++++ 2 files changed, 194 insertions(+) create mode 100644 tensorflow/lite/delegates/gpu/metal/kernels/depthwise_conv_test.mm diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/BUILD b/tensorflow/lite/delegates/gpu/metal/kernels/BUILD index f1ce542dad7..3a33b73b5d0 100644 --- a/tensorflow/lite/delegates/gpu/metal/kernels/BUILD +++ b/tensorflow/lite/delegates/gpu/metal/kernels/BUILD @@ -158,6 +158,28 @@ cc_library( ], ) +objc_library( + name = "depthwise_conv_test_lib", + testonly = 1, + srcs = ["depthwise_conv_test.mm"], + sdk_frameworks = ["XCTest"], + deps = [ + ":depthwise_conv", + ":test_util", + ], +) + +ios_unit_test( + name = "depthwise_conv_test", + testonly = 1, + minimum_os_version = "9.0", + tags = [ + "notap", + "tflite_not_portable_android", + ], + deps = [":depthwise_conv_test_lib"], +) + cc_library( name = "elementwise", srcs = ["elementwise.cc"], diff --git a/tensorflow/lite/delegates/gpu/metal/kernels/depthwise_conv_test.mm b/tensorflow/lite/delegates/gpu/metal/kernels/depthwise_conv_test.mm new file mode 100644 index 00000000000..f4215be5ad5 --- /dev/null +++ b/tensorflow/lite/delegates/gpu/metal/kernels/depthwise_conv_test.mm @@ -0,0 +1,172 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/lite/delegates/gpu/metal/kernels/add.h" + +#import + +#include + +#include "tensorflow/lite/delegates/gpu/common/operations.h" +#include "tensorflow/lite/delegates/gpu/common/shape.h" +#include "tensorflow/lite/delegates/gpu/common/status.h" +#include "tensorflow/lite/delegates/gpu/common/tensor.h" +#include "tensorflow/lite/delegates/gpu/common/util.h" +#include "tensorflow/lite/delegates/gpu/metal/compute_task_descriptor.h" +#include "tensorflow/lite/delegates/gpu/metal/kernels/test_util.h" +#include "tensorflow/lite/delegates/gpu/metal/runtime_options.h" + +using ::tflite::gpu::Axis; +using ::tflite::gpu::DepthwiseConvolution2DAttributes; +using ::tflite::gpu::DataType; +using ::tflite::gpu::BHWC; +using ::tflite::gpu::HW; +using ::tflite::gpu::Linear; +using ::tflite::gpu::metal::CompareVectors; +using ::tflite::gpu::metal::SingleOpModel; +using ::tflite::gpu::OperationType; +using ::tflite::gpu::OHWI; +using ::tflite::gpu::Tensor; +using ::tflite::gpu::TensorRef; + +@interface DepthwiseConvTest : XCTestCase +@end + +@implementation DepthwiseConvTest +- (void)setUp { + [super setUp]; +} + +- (void)testO4H1W1I2Strides1x1Dilation1x1 { + TensorRef input; + input.type = DataType::FLOAT32; + input.ref = 0; + input.shape = BHWC(1, 1, 1, 2); + + DepthwiseConvolution2DAttributes attr; + Tensor bias; + bias.shape.v = 4; + bias.id = 1; + bias.data = {1, 2, 3, 4}; + attr.bias = std::move(bias); + + Tensor weights; + weights.shape = OHWI(2, 1, 1, 2); + weights.id = 2; + weights.data = {1, 3, 2, 4}; + + attr.weights = std::move(weights); + + attr.dilations = HW(1, 1); + attr.padding.prepended = HW(0, 0); + attr.padding.appended = HW(0, 0); + attr.strides = HW(1, 1); + + TensorRef output; + output.type = DataType::FLOAT32; + output.ref = 3; + output.shape = BHWC(1, 1, 1, 4); + + SingleOpModel model( + {ToString(OperationType::DEPTHWISE_CONVOLUTION), std::move(attr)}, {input}, + {output}); + XCTAssertTrue(model.PopulateTensor(0, {1, 3})); + auto status = model.Invoke(); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); + status = CompareVectors({2, 4, 12, 16}, model.GetOutput(0), 1e-6f); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); +} + +- (void)testO2H1W1I1Strides2x2Dilation1x1 { + TensorRef input; + input.type = DataType::FLOAT32; + input.ref = 0; + input.shape = BHWC(1, 3, 3, 1); + + DepthwiseConvolution2DAttributes attr; + Tensor bias; + bias.shape.v = 4; + bias.id = 1; + bias.data = {0, 0}; + attr.bias = std::move(bias); + + Tensor weights; + weights.shape = OHWI(2, 1, 1, 1); + weights.id = 1; + weights.data = {1, 3}; + + attr.weights = std::move(weights); + + attr.dilations = HW(1, 1); + attr.padding.prepended = HW(0, 0); + attr.padding.appended = HW(0, 0); + attr.strides = HW(2, 2); + + TensorRef output; + output.type = DataType::FLOAT32; + output.ref = 3; + output.shape = BHWC(1, 2, 2, 2); + + SingleOpModel model( + {ToString(OperationType::DEPTHWISE_CONVOLUTION), std::move(attr)}, {input}, + {output}); + XCTAssertTrue(model.PopulateTensor(0, {1, 0, 1, 1, 0, 1, 1, 0, 1})); + auto status = model.Invoke(); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); + status = CompareVectors({1, 3, 1, 3, 1, 3, 1, 3}, model.GetOutput(0), 1e-6f); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); +} + +- (void)testO2H2W2I1Strides1x1Dilation2x2 { + TensorRef input; + input.type = DataType::FLOAT32; + input.ref = 0; + input.shape = BHWC(1, 3, 3, 
1); + + DepthwiseConvolution2DAttributes attr; + Tensor bias; + bias.shape.v = 4; + bias.id = 1; + bias.data = {0, 0}; + attr.bias = std::move(bias); + + Tensor weights; + weights.shape = OHWI(2, 2, 2, 1); + weights.id = 1; + weights.data = {1, 2, 3, 4, 5, 6, 7, 8}; + + attr.weights = std::move(weights); + + attr.dilations = HW(2, 2); + attr.padding.prepended = HW(0, 0); + attr.padding.appended = HW(0, 0); + attr.strides = HW(1, 1); + + TensorRef output; + output.type = DataType::FLOAT32; + output.ref = 3; + output.shape = BHWC(1, 1, 1, 2); + + SingleOpModel model( + {ToString(OperationType::DEPTHWISE_CONVOLUTION), std::move(attr)}, {input}, + {output}); + XCTAssertTrue(model.PopulateTensor(0, {1, 0, 1, 1, 0, 1, 1, 0, 1})); + auto status = model.Invoke(); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); + status = CompareVectors({10, 26}, model.GetOutput(0), 1e-6f); + XCTAssertTrue(status.ok(), @"%s", status.ToString().c_str()); +} + +@end From 30b4c9b63763dfce99b90861d7ca46783472ffaf Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Mon, 22 Jul 2019 17:47:08 -0700 Subject: [PATCH 0364/3053] [Grappler] Cancel Transpose nodes around Pad PiperOrigin-RevId: 259443368 --- tensorflow/core/grappler/optimizers/BUILD | 2 + .../optimizers/generic_layout_optimizer.cc | 92 ++++++++++++++++++- .../optimizers/generic_layout_optimizer.h | 10 +- .../generic_layout_optimizer_test.cc | 65 ++++++++++++- .../generic_layout_optimizer_transposer.cc | 18 ++-- .../generic_layout_optimizer_transposer.h | 8 +- 6 files changed, 179 insertions(+), 16 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD index afc8c5f7b25..42e7bef280a 100644 --- a/tensorflow/core/grappler/optimizers/BUILD +++ b/tensorflow/core/grappler/optimizers/BUILD @@ -1087,12 +1087,14 @@ tf_cuda_cc_test( "//tensorflow/core:tensorflow", "//tensorflow/core:test", "//tensorflow/core:test_main", + "//tensorflow/core:testlib", "//tensorflow/core/grappler:devices", "//tensorflow/core/grappler:grappler_item", "//tensorflow/core/grappler/clusters:cluster", "//tensorflow/core/grappler/clusters:single_machine", "//tensorflow/core/grappler/clusters:virtual_cluster", "//tensorflow/core/grappler/utils:graph_view", + "//tensorflow/core/grappler/utils:grappler_test", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", ], diff --git a/tensorflow/core/grappler/optimizers/generic_layout_optimizer.cc b/tensorflow/core/grappler/optimizers/generic_layout_optimizer.cc index 0318baf7b19..38393e14a5c 100644 --- a/tensorflow/core/grappler/optimizers/generic_layout_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/generic_layout_optimizer.cc @@ -156,11 +156,13 @@ inline bool IsCancellableConstPermTransposeNodePair( const utils::MutableNodeView& fanout_transpose, const utils::MutableNodeView& fanin_transpose) { Tensor fanout_tensor; - if (!GetValueAttrIfConstPermTransposeNode(fanout_transpose, &fanout_tensor)) { + if (!GetValueAttrFromConstInputNode(fanout_transpose, IsTranspose, 1, + &fanout_tensor)) { return false; } Tensor fanin_tensor; - if (!GetValueAttrIfConstPermTransposeNode(fanin_transpose, &fanin_tensor)) { + if (!GetValueAttrFromConstInputNode(fanin_transpose, IsTranspose, 1, + &fanin_tensor)) { return false; } if (fanout_tensor.NumElements() != fanin_tensor.NumElements()) { @@ -255,6 +257,87 @@ Status EraseCancellableNodes(TransposeContext* context) { return mutation->Apply(); } +// TODO(ezhulenev): This is a temporary workaround for a graph pattern +// in Resnet 
models. We should be able to push down transpose nodes across Pad
+// and many other ops, and then rely on cancellation to remove them.
+//
+// From: Transpose[NHWC->NCHW] -> Pad[paddings] -> Transpose[NCHW->NHWC]
+// To: Pad[Permute(paddings)]
+Status EraseCancellableNodesAroundPad(TransposeContext* context) {
+  utils::MutableGraphView* graph_view = context->graph_view.get();
+  utils::Mutation* mutation = graph_view->GetMutationBuilder();
+
+  const int num_nodes = graph_view->NumNodes();
+  for (int i = 0; i < num_nodes; ++i) {
+    // Transpose node after Pad.
+    auto* transpose_after = graph_view->GetNode(i);
+    if (!IsTranspose(*transpose_after->node())) continue;
+
+    // Pad node.
+    const auto& transpose_after_fanin = transpose_after->GetRegularFanin(0);
+    auto* pad = transpose_after_fanin.node_view();
+    if (!IsPad(*pad->node())) continue;
+
+    // Transpose node before Pad.
+    const auto& pad_fanin_0 = pad->GetRegularFanin(0);
+    auto* transpose_before = pad_fanin_0.node_view();
+    if (!IsTranspose(*transpose_before->node())) continue;
+
+    // Transpose before output used once by the Pad node.
+    if (transpose_before->NumRegularFanouts() != 1) continue;
+
+    // Transposes are cancellable.
+    if (!IsCancellableConstPermTransposeNodePair(*transpose_after,
+                                                 *transpose_before))
+      continue;
+
+    // Paddings are known constant values.
+    Tensor paddings_t;
+    if (!GetValueAttrFromConstInputNode(*pad, IsPad, 1, &paddings_t)) continue;
+
+    // Paddings value used once by the pad node only.
+    const auto& pad_fanin_1 = pad->GetRegularFanin(1);
+    auto* paddings = pad_fanin_1.node_view();
+    if (paddings->NumRegularFanouts() != 1) continue;
+
+    // Get permutation after the padding.
+    Tensor permute_t;
+    if (!GetValueAttrFromConstInputNode(*transpose_after, IsTranspose, 1,
+                                        &permute_t))
+      continue;
+
+    VLOG(0) << "Cancel transpose node pair around pad node:"
+            << " transpose_before=" << transpose_before->node()->name()
+            << " pad=" << pad->node()->name()
+            << " transpose_after=" << transpose_after->node()->name();
+
+    // Permute paddings in place according to permutation in second transpose.
+    auto permutation_s = absl::Span<int32>(permute_t.flat<int32>().data(),
+                                           permute_t.NumElements());
+    auto paddings_s = absl::Span<int32>(paddings_t.flat<int32>().data(),
+                                        paddings_t.NumElements());
+    TF_RETURN_IF_ERROR(PermuteDouble(permutation_s, &paddings_s));
+
+    // Update paddings constant value with a permuted tensor.
+    AttrValue permuted_paddings_tensor;
+    paddings_t.AsProtoTensorContent(permuted_paddings_tensor.mutable_tensor());
+    mutation->AddOrUpdateNodeAttr(paddings, "value", permuted_paddings_tensor);
+
+    // Transform Transpose nodes into Identity nodes.
+ const auto transpose_to_identity = + [&mutation](utils::MutableNodeView* transpose) -> void { + mutation->UpdateNodeOp(transpose, "Identity"); + mutation->RemoveNodeAttr(transpose, "Tperm"); + mutation->RemoveRegularFanin(transpose, 1); + }; + + transpose_to_identity(transpose_before); + transpose_to_identity(transpose_after); + } + + return mutation->Apply(); +} + Status EraseOutputShapeAttrs(TransposeContext* context) { utils::MutableGraphView* graph_view = context->graph_view.get(); utils::Mutation* mutation = graph_view->GetMutationBuilder(); @@ -284,6 +367,8 @@ Status GenericLayoutOptimizer::Optimize(Cluster* cluster, "GPU."); } + const bool is_aggressive = opt_level_ == RewriterConfig::AGGRESSIVE; + TransposeContext context; TF_RETURN_IF_ERROR( TransposeContext::InitializeTransposeContext(item, cluster, &context)); @@ -295,9 +380,10 @@ Status GenericLayoutOptimizer::Optimize(Cluster* cluster, TransposerFactory transposer_factory; TF_RETURN_IF_ERROR(ExpandLayoutSensitiveOp(&context, &transposer_factory)); - if (context.graph.node_size() > context.num_nodes) { + if (context.graph.node_size() > context.num_nodes || is_aggressive) { TF_RETURN_IF_ERROR(ExpandLayoutAgnosticOp(&context, &transposer_factory)); TF_RETURN_IF_ERROR(EraseCancellableNodes(&context)); + TF_RETURN_IF_ERROR(EraseCancellableNodesAroundPad(&context)); // TODO(lyandy): Remove sorting once other optimizers are migrated to using // `utils::GraphView`. TF_RETURN_IF_ERROR( diff --git a/tensorflow/core/grappler/optimizers/generic_layout_optimizer.h b/tensorflow/core/grappler/optimizers/generic_layout_optimizer.h index af8a2e395d3..9335b1d9dae 100644 --- a/tensorflow/core/grappler/optimizers/generic_layout_optimizer.h +++ b/tensorflow/core/grappler/optimizers/generic_layout_optimizer.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GENERIC_LAYOUT_OPTIMIZER_H_ #include "tensorflow/core/grappler/optimizers/graph_optimizer.h" +#include "tensorflow/core/protobuf/rewriter_config.pb.h" namespace tensorflow { namespace grappler { @@ -24,8 +25,10 @@ namespace grappler { // Optimize the data layout for convolutional models. class GenericLayoutOptimizer : public GraphOptimizer { public: - GenericLayoutOptimizer() : GraphOptimizer() {} - ~GenericLayoutOptimizer() override {} + GenericLayoutOptimizer() : GenericLayoutOptimizer(RewriterConfig::DEFAULT) {} + explicit GenericLayoutOptimizer(RewriterConfig::Toggle opt_level) + : opt_level_(opt_level) {} + ~GenericLayoutOptimizer() override = default; string name() const override { return "layout"; }; @@ -34,6 +37,9 @@ class GenericLayoutOptimizer : public GraphOptimizer { void Feedback(Cluster* cluster, const GrapplerItem& item, const GraphDef& optimize_output, double result) override; + + private: + RewriterConfig::Toggle opt_level_; }; } // namespace grappler diff --git a/tensorflow/core/grappler/optimizers/generic_layout_optimizer_test.cc b/tensorflow/core/grappler/optimizers/generic_layout_optimizer_test.cc index a48fde74c09..3a6316eef25 100644 --- a/tensorflow/core/grappler/optimizers/generic_layout_optimizer_test.cc +++ b/tensorflow/core/grappler/optimizers/generic_layout_optimizer_test.cc @@ -21,6 +21,7 @@ limitations under the License. 
#include "tensorflow/cc/ops/const_op.h" #include "tensorflow/cc/ops/nn_ops.h" #include "tensorflow/cc/ops/standard_ops.h" +#include "tensorflow/core/framework/function_testlib.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/tensor_testutil.h" #include "tensorflow/core/grappler/clusters/cluster.h" @@ -29,6 +30,7 @@ limitations under the License. #include "tensorflow/core/grappler/devices.h" #include "tensorflow/core/grappler/grappler_item.h" #include "tensorflow/core/grappler/utils/graph_view.h" +#include "tensorflow/core/grappler/utils/grappler_test.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" @@ -117,7 +119,7 @@ Output SimpleConv2DBackpropInput(tensorflow::Scope* s, int input_size, return conv_backprop_input; } -class GenericLayoutOptimizerTest : public ::testing::Test { +class GenericLayoutOptimizerTest : public GrapplerTest { protected: void SetUp() override { bool gpu_available = GetNumAvailableGPUs() > 0; @@ -525,6 +527,67 @@ TEST_F(GenericLayoutOptimizerTest, DoNotPruneNonAddedCancellableTransposes) { 0); } +TEST_F(GenericLayoutOptimizerTest, CancelTransposeAroundPad) { + using test::function::NDef; + + GenericLayoutOptimizer optimizer(RewriterConfig::AGGRESSIVE); + + const Tensor kPermuteNhwcToNchw = test::AsTensor({0, 3, 1, 2}); + const Tensor kPermuteNchwToNhwc = test::AsTensor({0, 2, 3, 1}); + const Tensor kPad = test::AsTensor({1, 2, 3, 4, 5, 6, 7, 8}, {4, 2}); + + GrapplerItem item; + item.graph = test::function::GDef({ + NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}), + + NDef("paddings", "Const", {}, {{"dtype", DT_INT32}, {"value", kPad}}), + NDef("perm_nhwc_to_nchw", "Const", {}, + {{"dtype", DT_INT32}, {"value", kPermuteNhwcToNchw}}), + NDef("perm_nchw_to_nhwc", "Const", {}, + {{"dtype", DT_INT32}, {"value", kPermuteNchwToNhwc}}), + + NDef("transpose_0", "Transpose", {"x", "perm_nhwc_to_nchw"}, + {{"T", DT_FLOAT}, {"Tperm", DT_INT32}}), + NDef("pad", "Pad", {"transpose_0", "paddings"}, + {{"T", DT_FLOAT}, {"Tpaddings", DT_INT32}}), + NDef("transpose_1", "Transpose", {"pad", "perm_nchw_to_nhwc"}, + {{"T", DT_FLOAT}, {"Tperm", DT_INT32}}), + }); + + GraphDef output; + TF_EXPECT_OK(optimizer.Optimize(virtual_cluster_.get(), item, &output)); + + const Tensor kPermutedPaddings = + test::AsTensor({1, 2, 5, 6, 7, 8, 3, 4}, {4, 2}); + + GraphDef expected = test::function::GDef({ + NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}), + + NDef("paddings", "Const", {}, + {{"dtype", DT_INT32}, {"value", kPermutedPaddings}}), + NDef("perm_nhwc_to_nchw", "Const", {}, + {{"dtype", DT_INT32}, {"value", kPermuteNhwcToNchw}}), + NDef("perm_nchw_to_nhwc", "Const", {}, + {{"dtype", DT_INT32}, {"value", kPermuteNchwToNhwc}}), + + // Transpose nodes replaced by Identity nodes. + NDef("transpose_0", "Identity", {"x"}, {{"T", DT_FLOAT}}), + NDef("pad", "Pad", {"transpose_0", "paddings"}, + {{"T", DT_FLOAT}, {"Tpaddings", DT_INT32}}), + NDef("transpose_1", "Identity", {"pad"}, {{"T", DT_FLOAT}}), + }); + + CompareGraphs(expected, output); + + Tensor x = GenerateRandomTensor({2, 6, 6, 8}); + item.fetch = {"transpose_1"}; + item.feed.emplace_back("x", x); + auto tensors_expected = EvaluateFetchNodes(item); + GrapplerItem optimized = item.WithGraph(std::move(output)); + auto tensors = EvaluateFetchNodes(optimized); + test::ExpectTensorEqual(tensors_expected[0], tensors[0]); +} + // TODO(yanzha): Add more complex Graph for test. 
} // namespace grappler diff --git a/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.cc b/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.cc index 2b8a1eb8970..87960edffe1 100644 --- a/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.cc +++ b/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.cc @@ -837,7 +837,7 @@ Status MaxPoolGradV2Transposer::TransposeNode(TransposeContext* context, inline bool IsValidConstPermTransposeNode(const utils::MutableNodeView& node, absl::Span permutation) { Tensor tensor; - if (!GetValueAttrIfConstPermTransposeNode(node, &tensor)) { + if (!GetValueAttrFromConstInputNode(node, IsTranspose, 1, &tensor)) { return false; } if (tensor.NumElements() != permutation.size()) { @@ -1799,17 +1799,19 @@ std::vector GetDataFanoutPorts(const utils::MutableNodeView& node) { return {0}; } -bool GetValueAttrIfConstPermTransposeNode(const utils::MutableNodeView& node, - Tensor* tensor) { - if (!IsTranspose(*node.node())) { +bool GetValueAttrFromConstInputNode( + const utils::MutableNodeView& node, + const std::function& predicate, int index, + Tensor* tensor) { + if (!predicate(*node.node())) { return false; } - const auto& regular_fanin_1 = node.GetRegularFanin(1); - auto* regular_fanin_1_node = regular_fanin_1.node_view(); - if (!IsConstant(*regular_fanin_1_node->node())) { + const auto& regular_fanin = node.GetRegularFanin(index); + auto* regular_fanin_node = regular_fanin.node_view(); + if (!IsConstant(*regular_fanin_node->node())) { return false; } - const auto* value_attr = regular_fanin_1_node->GetAttr(kAttrValue); + const auto* value_attr = regular_fanin_node->GetAttr(kAttrValue); if (value_attr == nullptr || value_attr->tensor().dtype() != DT_INT32) { return false; } diff --git a/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.h b/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.h index be609e84596..0928b141895 100644 --- a/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.h +++ b/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.h @@ -593,8 +593,12 @@ std::vector GetDataFaninPorts(const utils::MutableNodeView& node); std::vector GetDataFanoutPorts(const utils::MutableNodeView& node); -bool GetValueAttrIfConstPermTransposeNode(const utils::MutableNodeView& node, - Tensor* tensor); +// Returns a value of constant input to the `node` at `index`, iff `predicate` +// evaluated to true. Returns true if `tensor` was populated with data. +bool GetValueAttrFromConstInputNode( + const utils::MutableNodeView& node, + const std::function& predicate, int index, + Tensor* tensor); bool IsDataFormatOp(const utils::MutableNodeView& node); From 58a00b3e046d2d3037d1933ca6e539ba983a64ac Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Mon, 22 Jul 2019 18:03:09 -0700 Subject: [PATCH 0365/3053] Fix optimizer test failure if run_distributed is True. 
PiperOrigin-RevId: 259445851 --- .../keras/optimizer_v2/optimizer_v2_test.py | 62 ++++++++++++++++--- tensorflow/python/keras/optimizers_test.py | 40 ++++++++++-- 2 files changed, 89 insertions(+), 13 deletions(-) diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py index 7fc63d1c59c..04816a80829 100644 --- a/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py +++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py @@ -609,10 +609,15 @@ class OptimizerTest(test.TestCase): self.assertEqual('outter/Adam/var_2/m:0', opt_vars[3].name) -@keras_parameterized.run_with_all_model_types +@keras_parameterized.run_all_keras_modes class OptimizersCompatibilityTest(keras_parameterized.TestCase): + # After run_distributed is turned on, optimizer v1 can no longer work in + # eager mode, skipping the test if so. def _testOptimizersCompatibility(self, opt_v1, opt_v2, test_weights=True): + if testing_utils.should_run_distributed() or context.executing_eagerly(): + self.skipTest('v1 optimizer does not run in run_distributed mode or ' + 'eager mode') np.random.seed(1331) with self.cached_session(): train_samples = 20 @@ -628,13 +633,23 @@ class OptimizersCompatibilityTest(keras_parameterized.TestCase): num_hidden = 5 model_v1 = testing_utils.get_small_sequential_mlp( num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim) - model_v1.compile(opt_v1, loss='categorical_crossentropy', metrics=[]) + model_v1.compile( + opt_v1, + loss='categorical_crossentropy', + metrics=[], + run_eagerly=testing_utils.should_run_eagerly(), + run_distributed=testing_utils.should_run_distributed()) model_v1.fit(x, y, batch_size=5, epochs=1) model_v2 = testing_utils.get_small_sequential_mlp( num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim) model_v2.set_weights(model_v1.get_weights()) - model_v2.compile(opt_v2, loss='categorical_crossentropy', metrics=[]) + model_v2.compile( + opt_v2, + loss='categorical_crossentropy', + metrics=[], + run_eagerly=testing_utils.should_run_eagerly(), + run_distributed=testing_utils.should_run_distributed()) model_v2._make_train_function() if test_weights: opt_v2.set_weights(opt_v1.get_weights()) @@ -687,6 +702,9 @@ class OptimizersCompatibilityTest(keras_parameterized.TestCase): self._testOptimizersCompatibility(opt_v1, opt_v2, False) def testNumericEquivalenceForNesterovMomentum(self): + if testing_utils.should_run_distributed() or context.executing_eagerly(): + self.skipTest('v1 optimizer does not run in run_distributed mode or ' + 'eager mode') np.random.seed(1331) with self.cached_session(): train_samples = 20 @@ -714,9 +732,24 @@ class OptimizersCompatibilityTest(keras_parameterized.TestCase): opt_tf = momentum.MomentumOptimizer( learning_rate=0.01, momentum=0.9, use_nesterov=True) - model_k_v1.compile(opt_k_v1, loss='categorical_crossentropy', metrics=[]) - model_k_v2.compile(opt_k_v2, loss='categorical_crossentropy', metrics=[]) - model_tf.compile(opt_tf, loss='categorical_crossentropy', metrics=[]) + model_k_v1.compile( + opt_k_v1, + loss='categorical_crossentropy', + metrics=[], + run_eagerly=testing_utils.should_run_eagerly(), + run_distributed=testing_utils.should_run_distributed()) + model_k_v2.compile( + opt_k_v2, + loss='categorical_crossentropy', + metrics=[], + run_eagerly=testing_utils.should_run_eagerly(), + run_distributed=testing_utils.should_run_distributed()) + model_tf.compile( + opt_tf, + loss='categorical_crossentropy', + metrics=[], + 
run_eagerly=testing_utils.should_run_eagerly(), + run_distributed=testing_utils.should_run_distributed()) hist_k_v1 = model_k_v1.fit(x, y, batch_size=5, epochs=10, shuffle=False) hist_k_v2 = model_k_v2.fit(x, y, batch_size=5, epochs=10, shuffle=False) @@ -729,6 +762,9 @@ class OptimizersCompatibilityTest(keras_parameterized.TestCase): self.assertAllClose(hist_k_v1.history['loss'], hist_k_v2.history['loss']) def testNumericEquivalenceForAmsgrad(self): + if testing_utils.should_run_distributed() or context.executing_eagerly(): + self.skipTest('v1 optimizer does not run in run_distributed mode or ' + 'eager mode') np.random.seed(1331) with self.cached_session(): train_samples = 20 @@ -751,8 +787,18 @@ class OptimizersCompatibilityTest(keras_parameterized.TestCase): opt_k_v1 = optimizers.Adam(amsgrad=True) opt_k_v2 = adam.Adam(amsgrad=True) - model_k_v1.compile(opt_k_v1, loss='categorical_crossentropy', metrics=[]) - model_k_v2.compile(opt_k_v2, loss='categorical_crossentropy', metrics=[]) + model_k_v1.compile( + opt_k_v1, + loss='categorical_crossentropy', + metrics=[], + run_eagerly=testing_utils.should_run_eagerly(), + run_distributed=testing_utils.should_run_distributed()) + model_k_v2.compile( + opt_k_v2, + loss='categorical_crossentropy', + metrics=[], + run_eagerly=testing_utils.should_run_eagerly(), + run_distributed=testing_utils.should_run_distributed()) hist_k_v1 = model_k_v1.fit(x, y, batch_size=5, epochs=10, shuffle=False) hist_k_v2 = model_k_v2.fit(x, y, batch_size=5, epochs=10, shuffle=False) diff --git a/tensorflow/python/keras/optimizers_test.py b/tensorflow/python/keras/optimizers_test.py index c6146d3aafe..9eb2c052c93 100644 --- a/tensorflow/python/keras/optimizers_test.py +++ b/tensorflow/python/keras/optimizers_test.py @@ -24,7 +24,9 @@ import weakref import numpy as np from tensorflow.python import keras +from tensorflow.python.eager import context from tensorflow.python.framework import ops +from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras import testing_utils from tensorflow.python.platform import test from tensorflow.python.training.adam import AdamOptimizer @@ -39,16 +41,26 @@ def _get_model(input_dim, num_hidden, output_dim): return model -class KerasOptimizersTest(test.TestCase): +@keras_parameterized.run_all_keras_modes +class KerasOptimizersTest(keras_parameterized.TestCase): + # After run_distributed is turned on, optimizer v1 can no longer work in + # eager mode, skipping the test if so. 
def _test_optimizer(self, optimizer, target=0.75): + if testing_utils.should_run_distributed() or context.executing_eagerly(): + self.skipTest('v1 optimizer does not run in run_distributed mode or ' + 'eager mode') np.random.seed(1337) (x_train, y_train), _ = testing_utils.get_test_data( train_samples=1000, test_samples=200, input_shape=(10,), num_classes=2) y_train = keras.utils.to_categorical(y_train) model = _get_model(x_train.shape[1], 20, y_train.shape[1]) model.compile( - loss='categorical_crossentropy', optimizer=optimizer, metrics=['acc']) + loss='categorical_crossentropy', + optimizer=optimizer, + metrics=['acc'], + run_eagerly=testing_utils.should_run_eagerly(), + run_distributed=testing_utils.should_run_distributed()) np.testing.assert_equal( keras.backend.get_value(model.optimizer.iterations), 0) history = model.fit(x_train, y_train, epochs=2, batch_size=16, verbose=0) @@ -84,7 +96,9 @@ class KerasOptimizersTest(test.TestCase): model.compile( loss='categorical_crossentropy', optimizer=optimizer, - metrics=['accuracy']) + metrics=['accuracy'], + run_eagerly=testing_utils.should_run_eagerly(), + run_distributed=testing_utils.should_run_distributed()) np.testing.assert_equal( keras.backend.get_value(model.optimizer.iterations), 126) # Using same optimizer from before @@ -150,12 +164,18 @@ class KerasOptimizersTest(test.TestCase): keras.optimizers.SGD(lr=0.01, momentum=0.9, clipvalue=0.5)) def test_tf_optimizer(self): + if testing_utils.should_run_distributed() or context.executing_eagerly(): + self.skipTest('v1 optimizer does not run in run_distributed mode or ' + 'eager mode') optimizer = keras.optimizers.TFOptimizer(AdamOptimizer(0.01)) model = keras.models.Sequential() model.add(keras.layers.Dense( 2, input_shape=(3,), kernel_constraint=keras.constraints.MaxNorm(1))) # This is possible - model.compile(loss='mean_squared_error', optimizer=optimizer) + model.compile(loss='mean_squared_error', + optimizer=optimizer, + run_eagerly=testing_utils.should_run_eagerly(), + run_distributed=testing_utils.should_run_distributed()) keras.backend.track_tf_optimizer(optimizer) model.fit(np.random.random((5, 3)), np.random.random((5, 2)), @@ -171,6 +191,9 @@ class KerasOptimizersTest(test.TestCase): optimizer.from_config(None) def test_optimizer_garbage_collection(self): + if testing_utils.should_run_distributed() or context.executing_eagerly(): + self.skipTest('v1 optimizer does not run in run_distributed mode or ' + 'eager mode') graph = ops.Graph() with graph.as_default(): optimizer = keras.optimizers.TFOptimizer(AdamOptimizer(0.01)) @@ -184,12 +207,19 @@ class KerasOptimizersTest(test.TestCase): self.assertIs(optimizer_weak(), None) def test_tf_optimizer_iterations(self): + if testing_utils.should_run_distributed() or context.executing_eagerly(): + self.skipTest('v1 optimizer does not run in run_distributed mode or ' + 'eager mode') with self.cached_session(): optimizer = keras.optimizers.TFOptimizer(AdamOptimizer(0.01)) model = keras.models.Sequential() model.add(keras.layers.Dense( 2, input_shape=(3,), kernel_constraint=keras.constraints.MaxNorm(1))) - model.compile(loss='mean_squared_error', optimizer=optimizer) + model.compile( + loss='mean_squared_error', + optimizer=optimizer, + run_eagerly=testing_utils.should_run_eagerly(), + run_distributed=testing_utils.should_run_distributed()) keras.backend.track_tf_optimizer(optimizer) self.assertEqual(keras.backend.get_value(model.optimizer.iterations), 0) From 8c33208a94e1bbb5f51184ef496822cb98e718c9 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Mon, 22 Jul 2019 18:05:11 -0700 Subject: [PATCH 0366/3053] Update ops-related pbtxt files. PiperOrigin-RevId: 259446187 --- .../core/ops/compat/ops_history.v1.pbtxt | 75 +++++++++++++++++++ tensorflow/core/ops/ops.pbtxt | 7 ++ 2 files changed, 82 insertions(+) diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt index 13a1cb8e3bf..8d901ce7e03 100644 --- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt +++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt @@ -88810,6 +88810,81 @@ op { } } } +op { + name: "TPUReplicateMetadata" + attr { + name: "num_replicas" + type: "int" + has_minimum: true + } + attr { + name: "num_cores_per_replica" + type: "int" + default_value { + i: 1 + } + } + attr { + name: "topology" + type: "string" + default_value { + s: "" + } + } + attr { + name: "use_tpu" + type: "bool" + default_value { + b: true + } + } + attr { + name: "device_assignment" + type: "list(int)" + default_value { + list { + } + } + } + attr { + name: "computation_shape" + type: "list(int)" + default_value { + list { + } + } + } + attr { + name: "host_compute_core" + type: "list(string)" + default_value { + list { + } + } + } + attr { + name: "padding_map" + type: "list(string)" + default_value { + list { + } + } + } + attr { + name: "step_marker_location" + type: "string" + default_value { + s: "STEP_MARK_AT_ENTRY" + } + } + attr { + name: "allow_soft_placement" + type: "bool" + default_value { + b: false + } + } +} op { name: "TPUReplicatedInput" input_arg { diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index 64bdb7c3253..ba9658c5084 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -44319,6 +44319,13 @@ op { s: "STEP_MARK_AT_ENTRY" } } + attr { + name: "allow_soft_placement" + type: "bool" + default_value { + b: false + } + } } op { name: "TPUReplicatedInput" From 1f911f0819078f435a9fc2cad836772436c51e7e Mon Sep 17 00:00:00 2001 From: Amit Patankar Date: Mon, 22 Jul 2019 18:43:27 -0700 Subject: [PATCH 0367/3053] Create a new cross compile toolchain for CentOS6 on Ubuntu16.04 with new TensorRT 5.1. 
PiperOrigin-RevId: 259450964 --- tensorflow/opensource_only.files | 19 +++---- .../toolchains/preconfig/generate/BUILD | 9 ++-- .../preconfig/generate/containers.bzl | 2 +- .../gcc7_manylinux2010-nvcc-cuda10.0/BUILD | 50 ++++++++---------- .../bin/crosstool_wrapper_driver_is_not_gcc | 4 +- .../windows/msvc_wrapper_for_nvcc.py | 4 +- .../preconfig/ubuntu16.04/tensorrt5.1/BUILD | 51 +++++++++++++++++++ .../{tensorrt5 => tensorrt5.1}/WORKSPACE | 0 .../{tensorrt5 => tensorrt5.1}/build_defs.bzl | 2 +- 9 files changed, 94 insertions(+), 47 deletions(-) create mode 100755 third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/BUILD rename third_party/toolchains/preconfig/ubuntu16.04/{tensorrt5 => tensorrt5.1}/WORKSPACE (100%) rename third_party/toolchains/preconfig/ubuntu16.04/{tensorrt5 => tensorrt5.1}/build_defs.bzl (76%) diff --git a/tensorflow/opensource_only.files b/tensorflow/opensource_only.files index 27d7a82862d..ccf39fe0566 100644 --- a/tensorflow/opensource_only.files +++ b/tensorflow/opensource_only.files @@ -52,8 +52,15 @@ tensorflow/third_party/toolchains/preconfig/centos6/gcc7/cc_toolchain_config.bzl tensorflow/third_party/toolchains/preconfig/centos6/gcc7/dummy_toolchain.bzl tensorflow/third_party/toolchains/preconfig/centos6/gcc7/BUILD tensorflow/third_party/toolchains/preconfig/centos6/py3/BUILD -tensorflow/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5/build_defs.bzl -tensorflow/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5/BUILD +tensorflow/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/BUILD +tensorflow/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/build_defs.bzl +tensorflow/third_party/toolchains/preconfig/ubuntu14.04/py3/BUILD +tensorflow/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/build_defs.bzl +tensorflow/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD +tensorflow/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/BUILD +tensorflow/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/cc_toolchain_config.bzl +tensorflow/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/BUILD +tensorflow/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/build_defs.bzl tensorflow/third_party/toolchains/preconfig/ubuntu16.04/clang/BUILD tensorflow/third_party/toolchains/preconfig/ubuntu16.04/clang/cc_toolchain_config.bzl tensorflow/third_party/toolchains/preconfig/ubuntu16.04/clang/dummy_toolchain.bzl @@ -64,16 +71,10 @@ tensorflow/third_party/toolchains/preconfig/ubuntu16.04/rocm/rocm/BUILD tensorflow/third_party/toolchains/preconfig/ubuntu16.04/cuda10.0-cudnn7/cuda/BUILD tensorflow/third_party/toolchains/preconfig/ubuntu16.04/cuda10.0-cudnn7/cuda/build_defs.bzl tensorflow/third_party/toolchains/preconfig/ubuntu16.04/py3/BUILD +tensorflow/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5/BUILD tensorflow/third_party/toolchains/preconfig/ubuntu16.04/gcc5-rocm/BUILD tensorflow/third_party/toolchains/preconfig/ubuntu16.04/gcc5-rocm/cc_toolchain_config.bzl tensorflow/third_party/toolchains/preconfig/ubuntu16.04/py3_opt/BUILD -tensorflow/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/BUILD -tensorflow/third_party/toolchains/preconfig/ubuntu14.04/tensorrt5/build_defs.bzl -tensorflow/third_party/toolchains/preconfig/ubuntu14.04/py3/BUILD -tensorflow/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/build_defs.bzl -tensorflow/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD 
-tensorflow/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/BUILD -tensorflow/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/cc_toolchain_config.bzl tensorflow/third_party/toolchains/preconfig/generate/workspace.bzl tensorflow/third_party/toolchains/preconfig/generate/containers.bzl tensorflow/third_party/toolchains/preconfig/generate/generate.bzl diff --git a/third_party/toolchains/preconfig/generate/BUILD b/third_party/toolchains/preconfig/generate/BUILD index 261013871a5..2e6c6702506 100644 --- a/third_party/toolchains/preconfig/generate/BUILD +++ b/third_party/toolchains/preconfig/generate/BUILD @@ -86,14 +86,15 @@ tensorflow_rbe_config( ) tensorflow_rbe_config( - name = "ubuntu16.04-py3-gcc7_manylinux2010-cuda10.0-cudnn7-tensorrt5", - compiler = "gcc", - compiler_prefix = "/dt7/usr/bin", + name = "ubuntu16.04-py3-gcc7_manylinux2010-cuda10.0-cudnn7-tensorrt5.1", + compiler = "/dt7/usr/bin/gcc", + compiler_prefix = "/usr/bin", cuda_version = "10.0", cudnn_version = "7", os = "ubuntu16.04-manylinux2010", python_version = "3.6", - tensorrt_version = "5", + tensorrt_install_path = "/usr/local/tensorrt", + tensorrt_version = "5.1", ) tensorflow_rbe_config( diff --git a/third_party/toolchains/preconfig/generate/containers.bzl b/third_party/toolchains/preconfig/generate/containers.bzl index e8c4ffddeae..6f692165a27 100644 --- a/third_party/toolchains/preconfig/generate/containers.bzl +++ b/third_party/toolchains/preconfig/generate/containers.bzl @@ -6,6 +6,6 @@ container_digests = { "cuda10.0-cudnn7-centos7": "sha256:a453b7147a60928a8345689eae48916a746b3578b5e831bfa151f0529d469c88", "cuda10.0-cudnn7-centos6": "sha256:a1909ba09c703340ee0074ce63dd94fe8fea48035a25264677907a609e2375e0", "cuda10.1-cudnn7-centos6": "sha256:454b899657e87893ee5e68dc0f87df59b6a0a7418ae09cafcc3dd65ac71feca9", - "cuda10.0-cudnn7-ubuntu16.04-manylinux2010": "sha256:eedcedfe63a778068bf725f9ffa425646725faac9ba96a57abfad307e832dcf9", + "cuda10.0-cudnn7-ubuntu16.04-manylinux2010": "sha256:76cdd3956ce714bedca4b0c5b34c08e77fda7e888b8814da973d95f45628761c", "rocm-ubuntu16.04": "sha256:2df35a0b7f7513b4ca820a12792e98ecafafabd1076300ef26f89386277c10cc", } diff --git a/third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.0/BUILD b/third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.0/BUILD index 9a7a6a2281d..18b97f663ce 100755 --- a/third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.0/BUILD +++ b/third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.0/BUILD @@ -57,14 +57,12 @@ cc_toolchain( cc_toolchain_config( name = "cc-compiler-local-config", builtin_include_directories = [ - "/usr/include/c++/5", - "/usr/include/x86_64-linux-gnu/c++/5", - "/usr/include/c++/5/backward", - "/usr/lib/gcc/x86_64-linux-gnu/5/include", - "/usr/local/include", - "/usr/lib/gcc/x86_64-linux-gnu/5/include-fixed", - "/usr/include/x86_64-linux-gnu", - "/usr/include", + "/dt7/usr/include/c++/7", + "/dt7/usr/include/c++/7/x86_64-pc-linux-gnu", + "/dt7/usr/include/c++/7/backward", + "/dt7/usr/lib/gcc/x86_64-pc-linux-gnu/7/include", + "/dt7/usr/lib/gcc/x86_64-pc-linux-gnu/7/include-fixed", + "/dt7/usr/include", "/usr/local/cuda-10.0/targets/x86_64-linux/include", "/usr/local/cuda-10.0/include", "/usr/local/cuda-10.0/extras/CUPTI/include", @@ -73,10 +71,10 @@ cc_toolchain_config( cpu = "local", extra_no_canonical_prefixes_flags = ["-fno-canonical-system-headers"], host_compiler_path = "clang/bin/crosstool_wrapper_driver_is_not_gcc", - 
host_compiler_prefix = "/dt7/usr/bin", + host_compiler_prefix = "/usr/bin", host_compiler_warnings = [], host_unfiltered_compile_flags = [], - linker_bin_path = "/dt7/usr/bin", + linker_bin_path = "/usr/bin", ) cc_toolchain( @@ -95,14 +93,12 @@ cc_toolchain( cc_toolchain_config( name = "cc-compiler-local-darwin", builtin_include_directories = [ - "/usr/include/c++/5", - "/usr/include/x86_64-linux-gnu/c++/5", - "/usr/include/c++/5/backward", - "/usr/lib/gcc/x86_64-linux-gnu/5/include", - "/usr/local/include", - "/usr/lib/gcc/x86_64-linux-gnu/5/include-fixed", - "/usr/include/x86_64-linux-gnu", - "/usr/include", + "/dt7/usr/include/c++/7", + "/dt7/usr/include/c++/7/x86_64-pc-linux-gnu", + "/dt7/usr/include/c++/7/backward", + "/dt7/usr/lib/gcc/x86_64-pc-linux-gnu/7/include", + "/dt7/usr/lib/gcc/x86_64-pc-linux-gnu/7/include-fixed", + "/dt7/usr/include", "/usr/local/cuda-10.0/targets/x86_64-linux/include", "/usr/local/cuda-10.0/include", "/usr/local/cuda-10.0/extras/CUPTI/include", @@ -111,10 +107,10 @@ cc_toolchain_config( cpu = "darwin", extra_no_canonical_prefixes_flags = ["-fno-canonical-system-headers"], host_compiler_path = "clang/bin/crosstool_wrapper_driver_is_not_gcc", - host_compiler_prefix = "/dt7/usr/bin", + host_compiler_prefix = "/usr/bin", host_compiler_warnings = [], host_unfiltered_compile_flags = [], - linker_bin_path = "/dt7/usr/bin", + linker_bin_path = "/usr/bin", ) cc_toolchain( @@ -133,14 +129,12 @@ cc_toolchain( cc_toolchain_config( name = "cc-compiler-windows-config", builtin_include_directories = [ - "/usr/include/c++/5", - "/usr/include/x86_64-linux-gnu/c++/5", - "/usr/include/c++/5/backward", - "/usr/lib/gcc/x86_64-linux-gnu/5/include", - "/usr/local/include", - "/usr/lib/gcc/x86_64-linux-gnu/5/include-fixed", - "/usr/include/x86_64-linux-gnu", - "/usr/include", + "/dt7/usr/include/c++/7", + "/dt7/usr/include/c++/7/x86_64-pc-linux-gnu", + "/dt7/usr/include/c++/7/backward", + "/dt7/usr/lib/gcc/x86_64-pc-linux-gnu/7/include", + "/dt7/usr/lib/gcc/x86_64-pc-linux-gnu/7/include-fixed", + "/dt7/usr/include", "/usr/local/cuda-10.0/targets/x86_64-linux/include", "/usr/local/cuda-10.0/include", "/usr/local/cuda-10.0/extras/CUPTI/include", diff --git a/third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.0/clang/bin/crosstool_wrapper_driver_is_not_gcc b/third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.0/clang/bin/crosstool_wrapper_driver_is_not_gcc index 8e01f1f1de2..9800b7689a3 100755 --- a/third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.0/clang/bin/crosstool_wrapper_driver_is_not_gcc +++ b/third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.0/clang/bin/crosstool_wrapper_driver_is_not_gcc @@ -46,8 +46,8 @@ import sys import pipes # Template values set by cuda_autoconf. 
-CPU_COMPILER = ('/usr/bin/gcc') -GCC_HOST_COMPILER_PATH = ('/usr/bin/gcc') +CPU_COMPILER = ('/dt7/usr/bin/gcc') +GCC_HOST_COMPILER_PATH = ('/dt7/usr/bin/gcc') NVCC_PATH = '/usr/local/cuda-10.0/bin/nvcc' PREFIX_DIR = os.path.dirname(GCC_HOST_COMPILER_PATH) diff --git a/third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py b/third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py index 510ba52fd5e..79b98e587e3 100755 --- a/third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py +++ b/third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py @@ -30,8 +30,8 @@ import sys import pipes # Template values set by cuda_autoconf. -CPU_COMPILER = ('/usr/bin/gcc') -GCC_HOST_COMPILER_PATH = ('/usr/bin/gcc') +CPU_COMPILER = ('/dt7/usr/bin/gcc') +GCC_HOST_COMPILER_PATH = ('/dt7/usr/bin/gcc') NVCC_PATH = '/usr/local/cuda-10.0/bin/nvcc' NVCC_VERSION = '10.0' diff --git a/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/BUILD b/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/BUILD new file mode 100755 index 00000000000..574764d8dc1 --- /dev/null +++ b/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/BUILD @@ -0,0 +1,51 @@ +# NVIDIA TensorRT +# A high-performance deep learning inference optimizer and runtime. + +licenses(["notice"]) + +load("@local_config_cuda//cuda:build_defs.bzl", "cuda_default_copts") + +package(default_visibility = ["//visibility:public"]) + +exports_files(["LICENSE"]) + +cc_library( + name = "tensorrt_headers", + hdrs = [":tensorrt_include"], + include_prefix = "third_party/tensorrt", + strip_include_prefix = "tensorrt/include", +) + +cc_library( + name = "tensorrt", + srcs = [":tensorrt_lib"], + copts = cuda_default_copts(), + data = [":tensorrt_lib"], + linkstatic = 1, + deps = [ + ":tensorrt_headers", + "@local_config_cuda//cuda", + ], +) + +genrule( + name = "tensorrt_lib", + outs = [ + "tensorrt/lib/libnvinfer.so.5", + "tensorrt/lib/libnvinfer_plugin.so.5", + ], + cmd = """cp -f "/usr/local/tensorrt/lib/libnvinfer.so.5" "$(location tensorrt/lib/libnvinfer.so.5)" && \ +cp -f "/usr/local/tensorrt/lib/libnvinfer_plugin.so.5" "$(location tensorrt/lib/libnvinfer_plugin.so.5)" """, +) + +genrule( + name = "tensorrt_include", + outs = [ + "tensorrt/include/NvInfer.h", + "tensorrt/include/NvUtils.h", + "tensorrt/include/NvInferPlugin.h", + ], + cmd = """cp -f "/usr/local/tensorrt/include/NvInfer.h" "$(location tensorrt/include/NvInfer.h)" && \ +cp -f "/usr/local/tensorrt/include/NvUtils.h" "$(location tensorrt/include/NvUtils.h)" && \ +cp -f "/usr/local/tensorrt/include/NvInferPlugin.h" "$(location tensorrt/include/NvInferPlugin.h)" """, +) diff --git a/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5/WORKSPACE b/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/WORKSPACE similarity index 100% rename from third_party/toolchains/preconfig/ubuntu16.04/tensorrt5/WORKSPACE rename to third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/WORKSPACE diff --git a/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5/build_defs.bzl b/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/build_defs.bzl similarity index 76% rename from third_party/toolchains/preconfig/ubuntu16.04/tensorrt5/build_defs.bzl rename to third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/build_defs.bzl index 527be938341..4f242a5dae2 100755 --- 
a/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5/build_defs.bzl +++ b/third_party/toolchains/preconfig/ubuntu16.04/tensorrt5.1/build_defs.bzl @@ -1,4 +1,4 @@ -# Build configurations for TensorRT. +"""Build configurations for TensorRT.""" def if_tensorrt(if_true, if_false = []): """Tests whether TensorRT was enabled during the configure process.""" From 745b24b21de540097004772dd2105dbbb1102603 Mon Sep 17 00:00:00 2001 From: Ruoxin Sang Date: Mon, 22 Jul 2019 18:51:54 -0700 Subject: [PATCH 0368/3053] Fix a graph lifting bug which the same init op may be copied multiple times during the lifting. This may cause TPUMirroredVariable on different devices initialized to different values. PiperOrigin-RevId: 259451974 --- tensorflow/python/distribute/values_test.py | 46 +++++++++++++++++++++ tensorflow/python/eager/def_function.py | 6 ++- tensorflow/python/eager/lift_to_graph.py | 21 +++++++--- 3 files changed, 66 insertions(+), 7 deletions(-) diff --git a/tensorflow/python/distribute/values_test.py b/tensorflow/python/distribute/values_test.py index 0bedcc9134b..753f3f3d360 100644 --- a/tensorflow/python/distribute/values_test.py +++ b/tensorflow/python/distribute/values_test.py @@ -29,6 +29,7 @@ from tensorflow.python.distribute import distribution_strategy_context from tensorflow.python.distribute import strategy_combinations from tensorflow.python.distribute import tpu_strategy from tensorflow.python.distribute import values +from tensorflow.python.distribute.cluster_resolver import tpu_cluster_resolver from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.eager import test @@ -41,9 +42,11 @@ from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import random_ops from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables as variables_lib from tensorflow.python.saved_model.model_utils import mode_keys +from tensorflow.python.tpu import tpu_strategy_util from tensorflow.python.training import saver as saver_lib from tensorflow.python.training.tracking import util as trackable_utils from tensorflow.python.util import nest @@ -662,6 +665,28 @@ class MirroredVariableTest(test.TestCase, parameterized.TestCase): variable_scope.get_variable( name="testVar", initializer=1., use_resource=True) + @combinations.generate( + combinations.combine( + distribution=[ + strategy_combinations.mirrored_strategy_with_gpu_and_cpu, + strategy_combinations.tpu_strategy, + strategy_combinations.central_storage_strategy_with_two_gpus, + ], + mode=["eager"])) + def testInitializedToSameValueInsideEagerRun(self, distribution): + v = [None] + @def_function.function + def step(): + def f(): + if v[0] is None: + v[0] = variables_lib.Variable(random_ops.random_normal([])) + distribution.experimental_run_v2(f) + + context.set_global_seed(None) + step() + vals = self.evaluate(v[0].values) + self.assertAllEqual(vals[0], vals[1]) + _TPU_STRATEGIES = (tpu_strategy.TPUStrategy, tpu_strategy.TPUStrategyV1) @@ -1031,6 +1056,9 @@ class SyncOnReadVariableTest(test.TestCase, parameterized.TestCase): variables_lib.VariableAggregation.ONLY_FIRST_REPLICA, ] for aggregation in aggregations: + if isinstance(distribution, _TPU_STRATEGIES): + resolver = tpu_cluster_resolver.TPUClusterResolver('') + tpu_strategy_util.initialize_tpu_system(resolver) with distribution.scope(): v = 
variable_scope.variable( 0., @@ -1065,6 +1093,24 @@ class SyncOnReadVariableTest(test.TestCase, parameterized.TestCase): ValueError, "Could not convert from .* VariableAggregation\\.NONE"): self.evaluate(v.read_value()) + def testInitializedToSameValueInsideEagerRun(self, distribution): + if not context.executing_eagerly(): self.skipTest("eager only") + + v = [None] + @def_function.function + def step(): + def f(): + if v[0] is None: + v[0] = variables_lib.Variable( + random_ops.random_normal([]), + synchronization=variables_lib.VariableSynchronization.ON_READ) + distribution.experimental_run_v2(f) + + context.set_global_seed(None) + step() + vals = self.evaluate(v[0].values) + self.assertAllEqual(vals[0], vals[1]) + class PerReplicaTest(test.TestCase, parameterized.TestCase): diff --git a/tensorflow/python/eager/def_function.py b/tensorflow/python/eager/def_function.py index c5571b9bb6a..66c75024a33 100644 --- a/tensorflow/python/eager/def_function.py +++ b/tensorflow/python/eager/def_function.py @@ -511,13 +511,15 @@ class Function(object): # Note: using defun here avoids an infinite recursion. @function_lib.defun def initialize_variables(): + op_map = {} for v, init in initializer_map.items(): with ops.init_scope(): if resource_variable_ops.var_is_initialized_op(v.handle): # Ignore variables which are already initialized at trace time. continue - v.assign(lift_to_graph.lift_to_graph( - [init], ops.get_default_graph())[init]) + op_map = lift_to_graph.lift_to_graph( + [init], ops.get_default_graph(), op_map=op_map) + v.assign(op_map[init]) with ops.init_scope(): return initialize_variables.get_concrete_function()() diff --git a/tensorflow/python/eager/lift_to_graph.py b/tensorflow/python/eager/lift_to_graph.py index a25aa3f1973..a1c297e2c6f 100644 --- a/tensorflow/python/eager/lift_to_graph.py +++ b/tensorflow/python/eager/lift_to_graph.py @@ -120,6 +120,8 @@ def _copy_non_source(op, graph, op_map, base_graph): if f is not None and compat.as_str(f.name) not in graph._functions: f.add_to_graph(graph) # pylint: enable=protected-access + + # Create a new op in the destination graph if it doesn't exist before. copied_op = graph.create_op( op_type=op.type, inputs=copied_inputs, @@ -200,9 +202,14 @@ def _copy_source(s, graph, op_map, handle_captures, inverse_captures, op_map[s.op] = copied_placeholder.op -def lift_to_graph(init_tensors, graph, sources=None, - disallowed_placeholders=None, add_sources=False, - handle_captures=False, base_graph=None): +def lift_to_graph(init_tensors, + graph, + sources=None, + disallowed_placeholders=None, + add_sources=False, + handle_captures=False, + base_graph=None, + op_map=None): """Copies the tensor and all its inputs recursively to the outer graph. Args: @@ -218,6 +225,8 @@ def lift_to_graph(init_tensors, graph, sources=None, graph or simply create a vanilla placeholder. base_graph: The graph from which to lift ops. This will be inferred if not specified. + op_map: A map contains all the existing nodes that have been lifted to the + destination graph, so they won't be lifted and copied again. Returns: A mapping from ops in the current default graph to ops in `graph`. @@ -229,6 +238,7 @@ def lift_to_graph(init_tensors, graph, sources=None, i, resource_variable_ops.ResourceVariable)} init_tensors = set(init_tensors).difference(variable_init_tensors) base_graph = base_graph or list(init_tensors)[0].graph + op_map = op_map or {} # Check that the initializer does not depend on any placeholders. 
sources = set(sources or []) @@ -287,7 +297,8 @@ def lift_to_graph(init_tensors, graph, sources=None, # ends in the initializer. We copy those to the outermost graph and # build the initialization op there. with graph.as_default(): - op_map = {i: i for i in variable_init_tensors} # Pass through variables. + op_map.update({i: i for i in variable_init_tensors + }) # Pass through variables. source_ops = set() # Add the sources in the same order as the original graph. for s in six.itervalues(captures): @@ -314,7 +325,7 @@ def lift_to_graph(init_tensors, graph, sources=None, input_mutations = [] control_mutations = [] for op in reversed(ops_to_copy): - if op in source_ops: + if op in source_ops or op in op_map: continue new_input_mutations, new_control_mutations = _copy_non_source( op=op, graph=graph, op_map=op_map, base_graph=base_graph) From df6ba21e45e194e9465f19ffb98f4dc6fe15e9bc Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Mon, 22 Jul 2019 18:52:24 -0700 Subject: [PATCH 0369/3053] [XLA GPU] [NFC] Simplify IrEmitterUnnested::EmitKernel function The EmitKernel function is very complex, and a large amount of the complexity is brought by the machinery required for the 021 shared memory transposition. However, 021 transposition is only used by the EmitHlo021Tile user, and not by the reduction emitter. This CL achieves considerate logic simplification by moving the required machinery into the callback passed by EmitHlo021Tile, thus making EmitKernel simpler. PiperOrigin-RevId: 259452012 --- .../xla/service/gpu/ir_emitter_unnested.cc | 341 ++++++++---------- .../xla/service/gpu/ir_emitter_unnested.h | 39 +- 2 files changed, 161 insertions(+), 219 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 51c34371b00..c10f5b99b6a 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -98,10 +98,6 @@ namespace xla { namespace gpu { using llvm_ir::KernelMappingScheme; -using EmitElementFunction = - std::function; - namespace { using absl::InlinedVector; @@ -2200,41 +2196,6 @@ Status IrEmitterUnnested::EmitTargetElementLoop( return emit_status; } -std::vector IrEmitterUnnested::ConstructIrArrayForInputs( - const HloInstruction& hlo) { - std::vector param_arrays; - param_arrays.reserve(hlo.operands().size()); - for (const HloInstruction* param : hlo.operands()) { - param_arrays.push_back(GetIrArray(*param, hlo)); - } - return param_arrays; -} - -int IrEmitterUnnested::ConstructInputReducedShapeAndCastInputIrArrayToShape( - const HloInstruction& hlo, const std::vector& param_arrays, - const std::vector& param_buffers, - absl::Span reduced_output_dims, - std::vector* param_reduced_shapes, - std::vector* param_in_reduced_shape_arrays) { - int64 num_params = hlo.operands().size(); - param_in_reduced_shape_arrays->reserve(num_params); - param_reduced_shapes->reserve(num_params); - for (int64 id = 0; id < num_params; ++id) { - if (param_buffers[id] == nullptr) { - param_reduced_shapes->push_back(Shape()); - param_in_reduced_shape_arrays->push_back(IrArray()); - continue; - } - const HloInstruction* param = hlo.operand(id); - param_reduced_shapes->push_back(ShapeUtil::MakeShapeWithDescendingLayout( - param->shape().element_type(), - Permute({0, 2, 1}, reduced_output_dims))); - param_in_reduced_shape_arrays->push_back( - param_arrays[id].CastToShape((*param_reduced_shapes)[id], &b_)); - } - return num_params; -} - namespace { 
std::tuple GetStartOffsetAndStepForX( @@ -2254,12 +2215,12 @@ std::tuple GetStartOffsetAndStepForX( return std::make_tuple(start_offset_x, step_x); } -void EmitFullElementalTile(const KernelMappingScheme* mapping_scheme, - const IrArray::Index& tile_origin_index, - const string& loop_name, KernelSupportLibrary* ksl, - llvm::IRBuilder<>* builder, llvm::Value* y, - llvm::Value* x, llvm::Type* index_ty, - const EmitElementFunction& emit_elem_function) { +void EmitFullElementalTile( + const KernelMappingScheme* mapping_scheme, + const IrArray::Index& tile_origin_index, const string& loop_name, + KernelSupportLibrary* ksl, llvm::IRBuilder<>* builder, llvm::Value* y, + llvm::Value* x, llvm::Type* index_ty, + const IrEmitterUnnested::EmitElementFunction& emit_elem_function) { int64 num_threads_x = mapping_scheme->GetNumberOfThreadsForDimensionX(); int64 num_threads_y = mapping_scheme->GetNumberOfThreadsForDimensionY(); int64 tile_size_x = mapping_scheme->GetTileSizeForDimensionX(); @@ -2292,14 +2253,13 @@ void EmitFullElementalTile(const KernelMappingScheme* mapping_scheme, }); } -void EmitPartialElementalTile(const KernelMappingScheme* mapping_scheme, - const IrArray::Index& tile_origin_index, - const string& loop_name, - KernelSupportLibrary* ksl, - llvm::IRBuilder<>* builder, llvm::Value* y, - llvm::Value* x, llvm::Value* tile_height, - llvm::Value* tile_width, llvm::Type* index_ty, - const EmitElementFunction& emit_elem_function) { +void EmitPartialElementalTile( + const KernelMappingScheme* mapping_scheme, + const IrArray::Index& tile_origin_index, const string& loop_name, + KernelSupportLibrary* ksl, llvm::IRBuilder<>* builder, llvm::Value* y, + llvm::Value* x, llvm::Value* tile_height, llvm::Value* tile_width, + llvm::Type* index_ty, + const IrEmitterUnnested::EmitElementFunction& emit_elem_function) { int64 num_threads_x = mapping_scheme->GetNumberOfThreadsForDimensionX(); int64 num_threads_y = mapping_scheme->GetNumberOfThreadsForDimensionY(); int64 tile_size_x = mapping_scheme->GetTileSizeForDimensionX(); @@ -2361,7 +2321,7 @@ void EmitTiledElementalCodeWithBoundsCheck( const IrArray::Index& tile_origin_index, const string& loop_name, KernelSupportLibrary* ksl, llvm::IRBuilder<>* builder, llvm::Value* y, llvm::Value* x, llvm::Value* tile_height, llvm::Value* tile_width, - const EmitElementFunction& emit_elem_function) { + const IrEmitterUnnested::EmitElementFunction& emit_elem_function) { int64 tile_size_x = mapping_scheme->GetTileSizeForDimensionX(); int64 tile_size_y = mapping_scheme->GetTileSizeForDimensionY(); llvm::Type* index_ty = tile_width->getType(); @@ -2938,10 +2898,10 @@ void IrEmitterUnnested::EmitTileElementForReduction( } // Emits a kernel for the hlo instruction using the given tiling scheme. 
-void IrEmitterUnnested::EmitBlock(const TileGenerator& emit_one_tile, - KernelCodegenInfo* kernel_info, +void IrEmitterUnnested::EmitBlock(KernelCodegenInfo* kernel_info, KernelSupportLibrary* ksl, - llvm::Type* index_ty) { + llvm::Type* index_ty, + TileGenerator emit_one_tile) { KernelMappingScheme* mapping_scheme = kernel_info->GetKernelMappingScheme(); absl::Span dims_in_tile = mapping_scheme->GetDimensionsInTiles(); absl::Span dims_in_block = @@ -2986,8 +2946,6 @@ void IrEmitterUnnested::EmitBlock(const TileGenerator& emit_one_tile, absl::Span reduced_dims = mapping_scheme->GetDimensionsInElements(); - const bool block_contains_multi_tiles = - mapping_scheme->GetNumberOfTilesInOneBlock() > 1; // Emit the tile with a given tile_index, by calculating the tight bounds for // each dimension of the tile and then calling emit_one_tile. @@ -3008,7 +2966,7 @@ void IrEmitterUnnested::EmitBlock(const TileGenerator& emit_one_tile, IrArray::Index tile_origin = mapping_scheme->GetElementIndexForTileOrigin(tile_index); - emit_one_tile(tile_origin, output_tile_bounds, block_contains_multi_tiles); + emit_one_tile(tile_origin, output_tile_bounds); }; const IrArray::Index starting_block = @@ -3051,40 +3009,17 @@ LaunchDimensions IrEmitterUnnested::EmitKernel( const KernelCodeGenerator& kernel_generator, KernelCodegenInfo* kernel_info) { KernelMappingScheme* mapping_scheme = kernel_info->GetKernelMappingScheme(); - - std::vector param_arrays = ConstructIrArrayForInputs(*unnested_hlo); - int64 num_params = param_arrays.size(); - // Allocate shared memory buffers to store the tiled inputs. - std::vector param_shmem_buffers(num_params, nullptr); - for (int64 id : tiled_param_ids) { - const HloInstruction* param = unnested_hlo->operand(id); - param_shmem_buffers[id] = - mapping_scheme->GetSharedMemoryBufferForElementType( - llvm_ir::PrimitiveTypeToIrType(param->shape().element_type(), - module_), - IrName(unnested_hlo, StrCat("tile", id))); - VLOG(3) << "Added shmem buffer for parameter " << id << ": " - << llvm_ir::DumpToString(*param_shmem_buffers[id]); - } - - auto reduction_info = dynamic_cast(kernel_info); - bool is_column_reduction = - (reduction_info && !reduction_info->IsRowReduction()); - LaunchDimensions launch_dimensions(mapping_scheme->GetNumberOfBlocks(), mapping_scheme->GetThreadsPerBlock()); // TODO(b/110211620): Enable int32 index type for column reduction. + auto reduction_info = dynamic_cast(kernel_info); llvm::Type* index_ty = - is_column_reduction + (reduction_info && !reduction_info->IsRowReduction()) ? b_.getInt64Ty() : GetIndexTypeForKernel(unnested_hlo, launch_dimensions.launch_bound(), &b_); - auto index_typed_constant = [&](uint64 c) -> llvm::Constant* { - return llvm::ConstantInt::get(index_ty, c); - }; - // For multioutput fusion, one thread needs to output a tuple with pointers to // all the individual outputs. We could do this at any point in the kernel, // but we do it at the beginning in the hopes of reducing register pressure, @@ -3097,17 +3032,6 @@ LaunchDimensions IrEmitterUnnested::EmitKernel( }); } - // For each tiled parameter, cast its input IrArray to the corresponding - // reduced shape and keep the reduced shape live during IR emission. 
- std::vector param_in_reduced_shape_arrays; - std::vector param_reduced_shapes; - absl::Span reduced_dims = - mapping_scheme->GetDimensionsInElements(); - int num_shapes = ConstructInputReducedShapeAndCastInputIrArrayToShape( - *unnested_hlo, param_arrays, param_shmem_buffers, reduced_dims, - ¶m_reduced_shapes, ¶m_in_reduced_shape_arrays); - DCHECK_EQ(num_shapes, num_params); - // Calculate the starting element coordinate within a tile for the current // thread, (y, x) from thread_id. llvm::Value* x; @@ -3118,81 +3042,21 @@ LaunchDimensions IrEmitterUnnested::EmitKernel( mapping_scheme->GetNumberOfThreadsForDimensionX() == kWarpSize ? x : nullptr); kernel_info->SetIndexType(index_ty); - KernelSupportLibrary ksl(&b_, llvm_ir::UnrollMode::kDefaultUnroll); - // Curry a few parameters to EmitTiledElementalCodeWithBoundsCheck. - auto emit_tiled_elemental_code_with_bounds_check = - [&](const IrArray::Index& index, const string& loop_name, - llvm::Value* tile_height, llvm::Value* tile_width, - const EmitElementFunction& emit_elem_function) { - EmitTiledElementalCodeWithBoundsCheck(mapping_scheme, index, loop_name, - &ksl, &b_, y, x, tile_height, - tile_width, emit_elem_function); - }; - - auto emit_one_tile = [&](const IrArray::Index& output_tile_origin, - absl::Span output_tile_bounds, - bool block_contains_multi_tiles) { - // Calculate the input tile origin from the output tile origin. - const IrArray::Index input_tile_origin( - Permute({0, 2, 1}, output_tile_origin.multidim()), - Permute({0, 2, 1}, output_tile_origin.dims()), - output_tile_origin.GetType()); - - // If shared memory transpose is needed, wait for all threads to reach this - // point, lest we copy a value from tile to output before the other thread - // copies it from input to tile. This is `__syncthreads` in CUDA. - if (!tiled_param_ids.empty()) { - // Copy input parameter values to shared memory buffers: - // tile[y, x] = input[index] - // Note that tile_width and tile_height are flipped here because we are - // reading a transposed tile. - emit_tiled_elemental_code_with_bounds_check( - input_tile_origin, "input", output_tile_bounds[2], - output_tile_bounds[1], - [&](const IrArray::Index& index, llvm::Value* y_loc, - llvm::Value* x_loc, int64 /*x_iter_num*/) { - for (int64 id : tiled_param_ids) { - IrArray& input_in_logical_shape = - param_in_reduced_shape_arrays[id]; - llvm::Value* shmem_buffer = param_shmem_buffers[id]; - // TODO(jlebar): Add AA metadata to this store. Tile buffers are - // global variables, so LLVM can't infer much about it. - Store(input_in_logical_shape.EmitReadArrayElement( - index, &b_, "input_element"), - GEP(shmem_buffer, {index_typed_constant(0), y_loc, x_loc})); - } - }); - - // Wait for all threads to reach this point using `__syncthreads` in CUDA. - EmitCallToTargetIntrinsic(TargetIntrinsicID::kBarrierId, {}, {}, &b_); - } - - llvm_ir::TiledParameterInfo tiled_param_info(param_shmem_buffers, y, x); - kernel_info->SetTiledParamInfo(&tiled_param_info); - - // Write to output[index] by emitting code like normal, except that values - // for the tiled parameters are read from the shmem buffers. 
- emit_tiled_elemental_code_with_bounds_check( - output_tile_origin, "output", output_tile_bounds[1], - output_tile_bounds[2], - [&](const IrArray::Index& index, llvm::Value* y_loc, llvm::Value* x_loc, - int64 x_iter_num) { - kernel_generator.GetTileElementGenerator()( - unnested_hlo, index, kernel_info, y_loc, x_loc, x_iter_num); - }); - - // If a tile block contains multiple tiles and shared memory buffers are - // used, we need to wait for all threads to finish using the shared memory - // buffer for the current tile before we move on to process the next tile - // and overwrite the shared memory buffers. - if (block_contains_multi_tiles && !tiled_param_ids.empty()) { - EmitCallToTargetIntrinsic(TargetIntrinsicID::kBarrierId, {}, {}, &b_); - } - }; kernel_generator.GetBlockPrologueGenerator()(unnested_hlo, kernel_info); - EmitBlock(std::move(emit_one_tile), kernel_info, &ksl, index_ty); + EmitBlock(kernel_info, &ksl, index_ty, + [&](const IrArray::Index& output_tile_origin, + absl::Span output_tile_bounds) { + std::vector param_shmem_buffers( + unnested_hlo->operand_count(), nullptr); + llvm_ir::TiledParameterInfo tiled_param_info(param_shmem_buffers, + y, x); + kernel_info->SetTiledParamInfo(&tiled_param_info); + kernel_generator.GetTileElementGenerator()( + y, x, output_tile_origin, "output", output_tile_bounds[1], + output_tile_bounds[2], &ksl); + }); kernel_generator.GetBlockEpilogueGenerator()(unnested_hlo, kernel_info); return launch_dimensions; } @@ -3230,27 +3094,110 @@ LaunchDimensions IrEmitterUnnested::EmitHlo021Tile( /*tile_size_x=*/kWarpSize, /*req_block_sizes=*/{1, 1, 1}, /*num_threads_y=*/kNumRows, /*num_threads_x=*/kWarpSize, &b_); - TileElementGenerator element_generator; - if (hlo->opcode() == HloOpcode::kCopy) { - element_generator = [&](HloInstruction* hlo, - const llvm_ir::IrArray::Index& index, - const KernelCodegenInfo* kernel_info, - llvm::Value* y_loc, llvm::Value* x_loc, - int64 x_iter_num) { - EmitTileElementForCopy(hlo, index, kernel_info, y_loc, x_loc, x_iter_num); - }; - } else { - DCHECK_EQ(hlo->opcode(), HloOpcode::kFusion); - element_generator = - [&](HloInstruction* hlo, const llvm_ir::IrArray::Index& index, - const KernelCodegenInfo* kernel_info, llvm::Value* y_loc, - llvm::Value* x_loc, int64 x_iter_num) { - EmitTileElementForFusion(hlo, index, kernel_info, y_loc, x_loc, - x_iter_num); - }; - } KernelCodegenInfo kernel_info(&mapping_scheme); - KernelCodeGenerator kernel_generator(std::move(element_generator)); + + std::vector param_arrays; + + // For each tiled parameter, cast its input IrArray to the corresponding + // reduced shape and keep the reduced shape live during IR emission. 
+ std::vector param_in_reduced_shape_arrays; + std::vector param_shmem_buffers(hlo->operand_count(), nullptr); + + for (int64 id = 0; id < hlo->operand_count(); id++) { + const HloInstruction* param = hlo->operand(id); + param_arrays.push_back(GetIrArray(*param, *hlo)); + + if (absl::c_linear_search(tiled_param_ids, id)) { + param_shmem_buffers[id] = + mapping_scheme.GetSharedMemoryBufferForElementType( + llvm_ir::PrimitiveTypeToIrType(param->shape().element_type(), + module_), + IrName(hlo, StrCat("tile", id))); + VLOG(3) << "Added shmem buffer for parameter " << id << ": " + << llvm_ir::DumpToString(*param_shmem_buffers[id]); + Shape reduced_shape = ShapeUtil::MakeShapeWithDescendingLayout( + param->shape().element_type(), + Permute({0, 2, 1}, reduced_output_dims)); + LOG(ERROR) << "Generated shape: " << reduced_shape.ToString(true); + param_in_reduced_shape_arrays.push_back( + param_arrays[id].CastToShape(reduced_shape, &b_)); + } else { + param_in_reduced_shape_arrays.push_back(IrArray()); + } + } + + EmitElementFunction element_generator = + [&](const llvm_ir::IrArray::Index& index, llvm::Value* y_loc, + llvm::Value* x_loc, int64 x_iter_num) { + if (hlo->opcode() == HloOpcode::kCopy) { + EmitTileElementForCopy(hlo, index, &kernel_info, y_loc, x_loc, + x_iter_num); + } else { + CHECK_EQ(hlo->opcode(), HloOpcode::kFusion); + EmitTileElementForFusion(hlo, index, &kernel_info, y_loc, x_loc, + x_iter_num); + } + }; + + KernelCodeGenerator kernel_generator( + [&](llvm::Value* y, llvm::Value* x, const IrArray::Index& index, + const string& loop_name, llvm::Value* tile_height, + llvm::Value* tile_width, KernelSupportLibrary* ksl) { + llvm_ir::TiledParameterInfo tiled_param_info(param_shmem_buffers, y, x); + kernel_info.SetTiledParamInfo(&tiled_param_info); + + // If shared memory transpose is needed, wait for all threads to reach + // this point, lest we copy a value from tile to output before the other + // thread copies it from input to tile. This is `__syncthreads` in CUDA. + if (!tiled_param_ids.empty()) { + // Calculate the input tile origin from the output tile origin. + const IrArray::Index input_tile_origin( + Permute({0, 2, 1}, index.multidim()), + Permute({0, 2, 1}, index.dims()), index.GetType()); + + // Copy input parameter values to shared memory buffers: + // tile[y, x] = input[index] + // Note that tile_width and tile_height are flipped here because we + // are reading a transposed tile. + EmitTiledElementalCodeWithBoundsCheck( + &mapping_scheme, input_tile_origin, "input", ksl, &b_, y, x, + tile_width, tile_height, + [&](const IrArray::Index& index, llvm::Value* y_loc, + llvm::Value* x_loc, int64 /*x_iter_num*/) { + for (int64 id : tiled_param_ids) { + IrArray& input_in_logical_shape = + param_in_reduced_shape_arrays[id]; + + llvm::Value* shmem_buffer = param_shmem_buffers[id]; + llvm::Value* zero = + llvm::ConstantInt::get(kernel_info.GetIndexType(), 0); + // TODO(jlebar): Add AA metadata to this store. Tile buffers + // are global variables, so LLVM can't infer much about it. + Store(input_in_logical_shape.EmitReadArrayElement( + index, &b_, "input_element"), + GEP(shmem_buffer, {zero, y_loc, x_loc})); + } + }); + + // Wait for all threads to reach this point using `__syncthreads` in + // CUDA. 
+ EmitCallToTargetIntrinsic(TargetIntrinsicID::kBarrierId, {}, {}, &b_); + } + + EmitTiledElementalCodeWithBoundsCheck(&mapping_scheme, index, loop_name, + ksl, &b_, y, x, tile_height, + tile_width, element_generator); + bool block_contains_multi_tiles = + mapping_scheme.GetNumberOfTilesInOneBlock() > 1; + + // If a tile block contains multiple tiles and shared memory buffers are + // used, we need to wait for all threads to finish using the shared + // memory buffer for the current tile before we move on to process the + // next tile and overwrite the shared memory buffers. + if (block_contains_multi_tiles && !tiled_param_ids.empty()) { + EmitCallToTargetIntrinsic(TargetIntrinsicID::kBarrierId, {}, {}, &b_); + } + }); return EmitKernel(hlo, tiled_param_ids, kernel_generator, &kernel_info); } @@ -3679,13 +3626,21 @@ Status IrEmitterUnnested::EmitReductionFromOrToContiguousDimensions( std::tie(mapping_scheme, is_row_reduction) = ComputeMappingSchemeAndReductionKind(unnested_hlo, first_reduce); ReductionCodegenInfo reduction_info(&mapping_scheme, is_row_reduction); + EmitElementFunction emit_reduction_tile = + [&](const llvm_ir::IrArray::Index& index, llvm::Value* y_loc, + llvm::Value* x_loc, int64 x_iter_num) { + EmitTileElementForReduction(unnested_hlo, index, &reduction_info, y_loc, + x_loc, x_iter_num); + }; + KernelCodeGenerator kernel_generator( /*tile_element_generator=*/ - [&](HloInstruction* hlo, const llvm_ir::IrArray::Index& index, - const KernelCodegenInfo* kernel_info, llvm::Value* y_loc, - llvm::Value* x_loc, int64 x_iter_num) { - EmitTileElementForReduction(hlo, index, kernel_info, y_loc, x_loc, - x_iter_num); + [&](llvm::Value* y, llvm::Value* x, const IrArray::Index& index, + const string& loop_name, llvm::Value* tile_height, + llvm::Value* tile_width, KernelSupportLibrary* ksl) { + EmitTiledElementalCodeWithBoundsCheck(&mapping_scheme, index, loop_name, + ksl, &b_, y, x, tile_height, + tile_width, emit_reduction_tile); }, /*block_prologue_generator=*/ [&](HloInstruction* hlo, KernelCodegenInfo* kernel_info) { diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h index 0e3700fc59c..514de5aceb7 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h @@ -55,8 +55,7 @@ class IrEmitterUnnested : public IrEmitter { // to a global result to implement reduction. using TileGenerator = std::function output_tile_bounds, - bool block_contains_multi_tiles)>; + absl::Span output_tile_bounds)>; // KernelCodegenInfo records the common information to support the code // generation for a kernel to process tensor elements by blocks. A block of // tensor elements may contain one or multiple tiles. The code generators that @@ -101,6 +100,7 @@ class IrEmitterUnnested : public IrEmitter { // A function object to finalize the code generation for a tile block. using BlockEpilogueGenerator = std::function; + // A function object to generate code to process one element in a tile. // // hlo: the instruction for which the code is generated for. @@ -110,11 +110,15 @@ class IrEmitterUnnested : public IrEmitter { // kernel_info: Other information to support the kernel code generation. // x_iter_num: When a thread process N elements in the X dimension, x_iter_num // has a value of 0..N-1 to identify the element being process. 
- using TileElementGenerator = std::function; + using TileElementGenerator = std::function; + // KernelCodeGenerator records the code generator objects that generate code // for tile elements or tile block prologue/epilogue. class KernelCodeGenerator { @@ -255,9 +259,10 @@ class IrEmitterUnnested : public IrEmitter { absl::Span param_ids, const KernelCodeGenerator& kernel_generator, KernelCodegenInfo* kernel_info); - void EmitBlock(const TileGenerator& emit_one_tile, - KernelCodegenInfo* kernel_info, KernelSupportLibrary* ksl, - llvm::Type* index_ty); + + void EmitBlock(KernelCodegenInfo* kernel_info, KernelSupportLibrary* ksl, + llvm::Type* index_ty, TileGenerator emit_one_tile); + // Emits code to process a tensor element in a tile for the given kCopy HLO // that performs a 0-2-1 transpose. void EmitTileElementForCopy(HloInstruction* hlo, @@ -296,24 +301,6 @@ class IrEmitterUnnested : public IrEmitter { absl::Span reducers, absl::Span partial_result_addresses); - // Generates the IrArray for each input of an hlo and returns a vector that - // constains such IrArrays. - std::vector ConstructIrArrayForInputs( - const HloInstruction& hlo); - - // For each input of the `hlo` instruction, checks its value in - // `param_buffers` to find out whether the input has a reduced shape. If the - // input has a reduced shape, constructs the reduced shape for the input and - // casts the original input IrArray in `param_arrays` to the reduced shape. - // Return the total number of inputs. - int ConstructInputReducedShapeAndCastInputIrArrayToShape( - const HloInstruction& hlo, - const std::vector& param_arrays, - const std::vector& param_buffers, - absl::Span reduced_output_dims, - std::vector* param_reduced_shapes, - std::vector* param_in_reduced_shape_arrays); - // Returns a KernelThunk that invokes the kernel emitted for `inst`. The // caller needs to make sure `inst` outlives the lifetime of the returned // Thunk object. The kernel implementation will be unrolled if unroll_factor From 95bcd434d043359478679c2f9fdde69bcd0e8c82 Mon Sep 17 00:00:00 2001 From: Ian Langmore Date: Mon, 22 Jul 2019 19:05:06 -0700 Subject: [PATCH 0370/3053] Make non-meta linear operators (other than Circulant/Toeplitz) tape safe. 
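
Tape safe here means that a LinearOperator constructed from a tf.Variable keeps that variable visible to a GradientTape, so differentiating any operator method with respect to the variable yields a real gradient rather than None; the new check_tape_safe calls in the tests below exercise exactly this. A minimal sketch of the property, assuming the public tf.linalg export of the Householder operator that the test builds from a Variable:

import tensorflow as tf

# Same construction as the new test_tape_safe test, via the public API.
reflection_axis = tf.Variable([1., 3., 5., 8.])
operator = tf.linalg.LinearOperatorHouseholder(reflection_axis)

x = tf.ones([4, 2])
with tf.GradientTape() as tape:
  # Any differentiable operator method works; matmul is the simplest.
  y = tf.reduce_sum(operator.matmul(x))

# For a tape-safe operator this is a real gradient, not None.
grad = tape.gradient(y, reflection_axis)

The skip_options passed for the Householder case exist because its determinant and trace are hard-coded constants, so no gradient is expected through those particular methods.
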
PiperOrigin-RevId: 259453506 --- .../python/kernel_tests/wishart_test.py | 2 +- .../linalg/linear_operator_block_diag_test.py | 23 ++ .../linear_operator_householder_test.py | 17 + .../linalg/linear_operator_identity_test.py | 148 ++++----- .../linear_operator_lower_triangular_test.py | 9 + .../linalg/linear_operator_util_test.py | 299 +++++++----------- .../linalg/linear_operator_zeros_test.py | 81 ++--- .../ops/linalg/linear_operator_householder.py | 8 +- .../ops/linalg/linear_operator_identity.py | 39 +-- .../linear_operator_lower_triangular.py | 41 +-- .../ops/linalg/linear_operator_test_util.py | 57 +++- .../python/ops/linalg/linear_operator_util.py | 10 +- .../ops/linalg/linear_operator_zeros.py | 4 + 13 files changed, 388 insertions(+), 350 deletions(-) diff --git a/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py b/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py index cdee30bbc42..c924a22c290 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py @@ -382,7 +382,7 @@ class WishartCholeskyTest(test.TestCase): with self.assertRaisesRegexp(ValueError, "cannot be less than"): distributions.WishartCholesky( df=2, scale=chol_scale, validate_args=False) - with self.assertRaisesRegexp(TypeError, "Argument tril must have dtype"): + with self.assertRaisesRegexp(TypeError, "."): distributions.WishartCholesky( df=4., scale=np.asarray( diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py index a00e61c09dd..6a7c4362f5c 100644 --- a/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py +++ b/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py @@ -20,7 +20,9 @@ from __future__ import print_function import numpy as np from tensorflow.python.framework import dtypes +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops +from tensorflow.python.ops import variables as variables_module from tensorflow.python.ops.linalg import linalg as linalg_lib from tensorflow.python.ops.linalg import linear_operator_block_diag as block_diag from tensorflow.python.ops.linalg import linear_operator_lower_triangular as lower_triangular @@ -56,6 +58,7 @@ def _block_diag_dense(expected_shape, blocks): return array_ops.concat(rows, axis=-2) +@test_util.run_all_in_graph_and_eager_modes class SquareLinearOperatorBlockDiagTest( linear_operator_test_util.SquareLinearOperatorDerivedClassTest): """Most tests done in the base class LinearOperatorDerivedClassTest.""" @@ -209,6 +212,26 @@ class SquareLinearOperatorBlockDiagTest( block_diag.LinearOperatorBlockDiag) self.assertEqual(2, len(inverse.operators)) + def test_tape_safe(self): + matrix = variables_module.Variable([[1., 0.], [0., 1.]]) + operator = block_diag.LinearOperatorBlockDiag( + [ + linalg.LinearOperatorFullMatrix( + matrix, + is_self_adjoint=True, + is_positive_definite=True, + ), + linalg.LinearOperatorFullMatrix( + matrix, + is_self_adjoint=True, + is_positive_definite=True, + ), + ], + is_self_adjoint=True, + is_positive_definite=True, + ) + self.check_tape_safe(operator) + def test_is_non_singular_auto_set(self): # Matrix with two positive eigenvalues, 11 and 8. # The matrix values do not effect auto-setting of the flags. 
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_householder_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_householder_test.py index 5f435764945..b333dbf6ff4 100644 --- a/tensorflow/python/kernel_tests/linalg/linear_operator_householder_test.py +++ b/tensorflow/python/kernel_tests/linalg/linear_operator_householder_test.py @@ -17,17 +17,21 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import variables as variables_module from tensorflow.python.ops.linalg import linalg as linalg_lib from tensorflow.python.ops.linalg import linear_operator_householder as householder from tensorflow.python.ops.linalg import linear_operator_test_util from tensorflow.python.platform import test linalg = linalg_lib +CheckTapeSafeSkipOptions = linear_operator_test_util.CheckTapeSafeSkipOptions +@test_util.run_all_in_graph_and_eager_modes class LinearOperatorHouseholderTest( linear_operator_test_util.SquareLinearOperatorDerivedClassTest): """Most tests done in the base class LinearOperatorDerivedClassTest.""" @@ -87,6 +91,19 @@ class LinearOperatorHouseholderTest( self.assertIsInstance( operator.inverse(), householder.LinearOperatorHouseholder) + def test_tape_safe(self): + reflection_axis = variables_module.Variable([1., 3., 5., 8.]) + operator = householder.LinearOperatorHouseholder(reflection_axis) + self.check_tape_safe( + operator, + skip_options=[ + # Determinant hard-coded as 1. + CheckTapeSafeSkipOptions.DETERMINANT, + CheckTapeSafeSkipOptions.LOG_ABS_DETERMINANT, + # Trace hard-coded. 
+ CheckTapeSafeSkipOptions.TRACE, + ]) + if __name__ == "__main__": linear_operator_test_util.add_tests(LinearOperatorHouseholderTest) diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py index 3d29adc143f..18e8ccfd74d 100644 --- a/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py +++ b/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py @@ -25,6 +25,7 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops +from tensorflow.python.ops import variables as variables_module from tensorflow.python.ops.linalg import linalg as linalg_lib from tensorflow.python.ops.linalg import linear_operator_test_util from tensorflow.python.platform import test @@ -33,6 +34,7 @@ from tensorflow.python.platform import test rng = np.random.RandomState(2016) +@test_util.run_all_in_graph_and_eager_modes class LinearOperatorIdentityTest( linear_operator_test_util.SquareLinearOperatorDerivedClassTest): """Most tests done in the base class LinearOperatorDerivedClassTest.""" @@ -61,23 +63,20 @@ class LinearOperatorIdentityTest( return operator, mat - @test_util.run_deprecated_v1 def test_assert_positive_definite(self): with self.cached_session(): operator = linalg_lib.LinearOperatorIdentity(num_rows=2) - operator.assert_positive_definite().run() # Should not fail + self.evaluate(operator.assert_positive_definite()) # Should not fail - @test_util.run_deprecated_v1 def test_assert_non_singular(self): with self.cached_session(): operator = linalg_lib.LinearOperatorIdentity(num_rows=2) - operator.assert_non_singular().run() # Should not fail + self.evaluate(operator.assert_non_singular()) # Should not fail - @test_util.run_deprecated_v1 def test_assert_self_adjoint(self): with self.cached_session(): operator = linalg_lib.LinearOperatorIdentity(num_rows=2) - operator.assert_self_adjoint().run() # Should not fail + self.evaluate(operator.assert_self_adjoint()) # Should not fail def test_float16_matmul(self): # float16 cannot be tested by base test class because tf.linalg.solve does @@ -113,41 +112,38 @@ class LinearOperatorIdentityTest( with self.assertRaisesRegexp(ValueError, "must be non-negative"): linalg_lib.LinearOperatorIdentity(num_rows=2, batch_shape=[-2]) - @test_util.run_deprecated_v1 def test_non_scalar_num_rows_raises_dynamic(self): with self.cached_session(): - num_rows = array_ops.placeholder(dtypes.int32) - operator = linalg_lib.LinearOperatorIdentity( - num_rows, assert_proper_shapes=True) - with self.assertRaisesOpError("must be a 0-D Tensor"): - operator.to_dense().eval(feed_dict={num_rows: [2]}) + num_rows = array_ops.placeholder_with_default([2], shape=None) + + with self.assertRaisesError("must be a 0-D Tensor"): + operator = linalg_lib.LinearOperatorIdentity( + num_rows, assert_proper_shapes=True) + self.evaluate(operator.to_dense()) - @test_util.run_deprecated_v1 def test_negative_num_rows_raises_dynamic(self): with self.cached_session(): - num_rows = array_ops.placeholder(dtypes.int32) - operator = linalg_lib.LinearOperatorIdentity( - num_rows, assert_proper_shapes=True) - with self.assertRaisesOpError("must be non-negative"): - operator.to_dense().eval(feed_dict={num_rows: -2}) + num_rows = array_ops.placeholder_with_default(-2, shape=None) + with self.assertRaisesError("must be non-negative"): + operator = 
linalg_lib.LinearOperatorIdentity( + num_rows, assert_proper_shapes=True) + self.evaluate(operator.to_dense()) - @test_util.run_deprecated_v1 def test_non_1d_batch_shape_raises_dynamic(self): with self.cached_session(): - batch_shape = array_ops.placeholder(dtypes.int32) - operator = linalg_lib.LinearOperatorIdentity( - num_rows=2, batch_shape=batch_shape, assert_proper_shapes=True) - with self.assertRaisesOpError("must be a 1-D"): - operator.to_dense().eval(feed_dict={batch_shape: 2}) + batch_shape = array_ops.placeholder_with_default(2, shape=None) + with self.assertRaisesError("must be a 1-D"): + operator = linalg_lib.LinearOperatorIdentity( + num_rows=2, batch_shape=batch_shape, assert_proper_shapes=True) + self.evaluate(operator.to_dense()) - @test_util.run_deprecated_v1 def test_negative_batch_shape_raises_dynamic(self): with self.cached_session(): - batch_shape = array_ops.placeholder(dtypes.int32) - operator = linalg_lib.LinearOperatorIdentity( - num_rows=2, batch_shape=batch_shape, assert_proper_shapes=True) - with self.assertRaisesOpError("must be non-negative"): - operator.to_dense().eval(feed_dict={batch_shape: [-2]}) + batch_shape = array_ops.placeholder_with_default([-2], shape=None) + with self.assertRaisesError("must be non-negative"): + operator = linalg_lib.LinearOperatorIdentity( + num_rows=2, batch_shape=batch_shape, assert_proper_shapes=True) + self.evaluate(operator.to_dense()) def test_wrong_matrix_dimensions_raises_static(self): operator = linalg_lib.LinearOperatorIdentity(num_rows=2) @@ -155,17 +151,16 @@ class LinearOperatorIdentityTest( with self.assertRaisesRegexp(ValueError, "Dimensions.*not compatible"): operator.matmul(x) - @test_util.run_deprecated_v1 def test_wrong_matrix_dimensions_raises_dynamic(self): - num_rows = array_ops.placeholder(dtypes.int32) - x = array_ops.placeholder(dtypes.float32) + num_rows = array_ops.placeholder_with_default(2, shape=None) + x = array_ops.placeholder_with_default( + rng.rand(3, 3).astype(np.float32), shape=None) with self.cached_session(): - operator = linalg_lib.LinearOperatorIdentity( - num_rows, assert_proper_shapes=True) - y = operator.matmul(x) - with self.assertRaisesOpError("Incompatible.*dimensions"): - y.eval(feed_dict={num_rows: 2, x: rng.rand(3, 3)}) + with self.assertRaisesError("Dimensions.*not.compatible"): + operator = linalg_lib.LinearOperatorIdentity( + num_rows, assert_proper_shapes=True) + self.evaluate(operator.matmul(x)) def test_default_batch_shape_broadcasts_with_everything_static(self): # These cannot be done in the automated (base test class) tests since they @@ -181,22 +176,18 @@ class LinearOperatorIdentityTest( self.assertAllEqual(operator_matmul.get_shape(), expected.get_shape()) self.assertAllClose(*self.evaluate([operator_matmul, expected])) - @test_util.run_deprecated_v1 def test_default_batch_shape_broadcasts_with_everything_dynamic(self): # These cannot be done in the automated (base test class) tests since they # test shapes that tf.batch_matmul cannot handle. # In particular, tf.batch_matmul does not broadcast. 
- with self.cached_session() as sess: - x = array_ops.placeholder(dtypes.float32) + with self.cached_session(): + x = array_ops.placeholder_with_default(rng.randn(1, 2, 3, 4), shape=None) operator = linalg_lib.LinearOperatorIdentity(num_rows=3, dtype=x.dtype) operator_matmul = operator.matmul(x) expected = x - feed_dict = {x: rng.randn(1, 2, 3, 4)} - - self.assertAllClose( - *sess.run([operator_matmul, expected], feed_dict=feed_dict)) + self.assertAllClose(*self.evaluate([operator_matmul, expected])) def test_broadcast_matmul_static_shapes(self): # These cannot be done in the automated (base test class) tests since they @@ -219,21 +210,19 @@ class LinearOperatorIdentityTest( self.assertAllEqual(operator_matmul.get_shape(), expected.get_shape()) self.assertAllClose(*self.evaluate([operator_matmul, expected])) - @test_util.run_deprecated_v1 def test_broadcast_matmul_dynamic_shapes(self): # These cannot be done in the automated (base test class) tests since they # test shapes that tf.batch_matmul cannot handle. # In particular, tf.batch_matmul does not broadcast. - with self.cached_session() as sess: + with self.cached_session(): # Given this x and LinearOperatorIdentity shape of (2, 1, 3, 3), the # broadcast shape of operator and 'x' is (2, 2, 3, 4) - x = array_ops.placeholder(dtypes.float32) - num_rows = array_ops.placeholder(dtypes.int32) - batch_shape = array_ops.placeholder(dtypes.int32) + x = array_ops.placeholder_with_default(rng.rand(1, 2, 3, 4), shape=None) + num_rows = array_ops.placeholder_with_default(3, shape=None) + batch_shape = array_ops.placeholder_with_default((2, 1), shape=None) operator = linalg_lib.LinearOperatorIdentity( - num_rows, batch_shape=batch_shape) - feed_dict = {x: rng.rand(1, 2, 3, 4), num_rows: 3, batch_shape: (2, 1)} + num_rows, batch_shape=batch_shape, dtype=dtypes.float64) # Batch matrix of zeros with the broadcast shape of x and operator. zeros = array_ops.zeros(shape=(2, 2, 3, 4), dtype=x.dtype) @@ -242,8 +231,7 @@ class LinearOperatorIdentityTest( expected = x + zeros operator_matmul = operator.matmul(x) - self.assertAllClose( - *sess.run([operator_matmul, expected], feed_dict=feed_dict)) + self.assertAllClose(*self.evaluate([operator_matmul, expected])) def test_is_x_flags(self): # The is_x flags are by default all True. @@ -280,7 +268,16 @@ class LinearOperatorIdentityTest( self.assertIsInstance( operator.inverse(), linalg_lib.LinearOperatorIdentity) + def test_ref_type_shape_args_raises(self): + with self.assertRaisesRegexp(TypeError, "num_rows.*reference"): + linalg_lib.LinearOperatorIdentity(num_rows=variables_module.Variable(2)) + with self.assertRaisesRegexp(TypeError, "batch_shape.*reference"): + linalg_lib.LinearOperatorIdentity( + num_rows=2, batch_shape=variables_module.Variable([3])) + + +@test_util.run_all_in_graph_and_eager_modes class LinearOperatorScaledIdentityTest( linear_operator_test_util.SquareLinearOperatorDerivedClassTest): """Most tests done in the base class LinearOperatorDerivedClassTest.""" @@ -331,47 +328,44 @@ class LinearOperatorScaledIdentityTest( return operator, matrix - @test_util.run_deprecated_v1 def test_assert_positive_definite_does_not_raise_when_positive(self): with self.cached_session(): operator = linalg_lib.LinearOperatorScaledIdentity( num_rows=2, multiplier=1.) 
- operator.assert_positive_definite().run() # Should not fail + self.evaluate(operator.assert_positive_definite()) # Should not fail def test_assert_positive_definite_raises_when_negative(self): with self.cached_session(): operator = linalg_lib.LinearOperatorScaledIdentity( num_rows=2, multiplier=-1.) with self.assertRaisesOpError("not positive definite"): - operator.assert_positive_definite().run() + self.evaluate(operator.assert_positive_definite()) - @test_util.run_deprecated_v1 def test_assert_non_singular_does_not_raise_when_non_singular(self): with self.cached_session(): operator = linalg_lib.LinearOperatorScaledIdentity( num_rows=2, multiplier=[1., 2., 3.]) - operator.assert_non_singular().run() # Should not fail + self.evaluate(operator.assert_non_singular()) # Should not fail def test_assert_non_singular_raises_when_singular(self): with self.cached_session(): operator = linalg_lib.LinearOperatorScaledIdentity( num_rows=2, multiplier=[1., 2., 0.]) with self.assertRaisesOpError("was singular"): - operator.assert_non_singular().run() + self.evaluate(operator.assert_non_singular()) - @test_util.run_deprecated_v1 def test_assert_self_adjoint_does_not_raise_when_self_adjoint(self): with self.cached_session(): operator = linalg_lib.LinearOperatorScaledIdentity( num_rows=2, multiplier=[1. + 0J]) - operator.assert_self_adjoint().run() # Should not fail + self.evaluate(operator.assert_self_adjoint()) # Should not fail def test_assert_self_adjoint_raises_when_not_self_adjoint(self): with self.cached_session(): operator = linalg_lib.LinearOperatorScaledIdentity( num_rows=2, multiplier=[1. + 1J]) with self.assertRaisesOpError("not self-adjoint"): - operator.assert_self_adjoint().run() + self.evaluate(operator.assert_self_adjoint()) def test_float16_matmul(self): # float16 cannot be tested by base test class because tf.linalg.solve does @@ -397,17 +391,18 @@ class LinearOperatorScaledIdentityTest( with self.assertRaisesRegexp(ValueError, "Dimensions.*not compatible"): operator.matmul(x) - @test_util.run_deprecated_v1 def test_wrong_matrix_dimensions_raises_dynamic(self): - num_rows = array_ops.placeholder(dtypes.int32) - x = array_ops.placeholder(dtypes.float32) + num_rows = array_ops.placeholder_with_default(2, shape=None) + x = array_ops.placeholder_with_default( + rng.rand(3, 3).astype(np.float32), shape=None) with self.cached_session(): - operator = linalg_lib.LinearOperatorScaledIdentity( - num_rows, multiplier=[1., 2], assert_proper_shapes=True) - y = operator.matmul(x) - with self.assertRaisesOpError("Incompatible.*dimensions"): - y.eval(feed_dict={num_rows: 2, x: rng.rand(3, 3)}) + with self.assertRaisesError("Dimensions.*not.compatible"): + operator = linalg_lib.LinearOperatorScaledIdentity( + num_rows, + multiplier=[1., 2], + assert_proper_shapes=True) + self.evaluate(operator.matmul(x)) def test_broadcast_matmul_and_solve(self): # These cannot be done in the automated (base test class) tests since they @@ -530,6 +525,17 @@ class LinearOperatorScaledIdentityTest( operator.inverse(), linalg_lib.LinearOperatorScaledIdentity) + def test_ref_type_shape_args_raises(self): + with self.assertRaisesRegexp(TypeError, "num_rows.*reference"): + linalg_lib.LinearOperatorScaledIdentity( + num_rows=variables_module.Variable(2), multiplier=1.23) + + def test_tape_safe(self): + multiplier = variables_module.Variable(1.23) + operator = linalg_lib.LinearOperatorScaledIdentity( + num_rows=2, multiplier=multiplier) + self.check_tape_safe(operator) + if __name__ == "__main__": 
linear_operator_test_util.add_tests(LinearOperatorIdentityTest) diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py index c86beebf1f3..02ce5b810eb 100644 --- a/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py +++ b/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py @@ -17,8 +17,10 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import variables as variables_module from tensorflow.python.ops.linalg import linalg as linalg_lib from tensorflow.python.ops.linalg import linear_operator_test_util from tensorflow.python.platform import test @@ -26,6 +28,7 @@ from tensorflow.python.platform import test linalg = linalg_lib +@test_util.run_all_in_graph_and_eager_modes class LinearOperatorLowerTriangularTest( linear_operator_test_util.SquareLinearOperatorDerivedClassTest): """Most tests done in the base class LinearOperatorDerivedClassTest.""" @@ -101,6 +104,12 @@ class LinearOperatorLowerTriangularTest( operator1.to_dense()), self.evaluate(operator_matmul.to_dense())) + def test_tape_safe(self): + tril = variables_module.Variable([[1., 0.], [0., 1.]]) + operator = linalg_lib.LinearOperatorLowerTriangular( + tril, is_non_singular=True) + self.check_tape_safe(operator) + if __name__ == "__main__": linear_operator_test_util.add_tests(LinearOperatorLowerTriangularTest) diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py index 03086e64ecf..a8dfcdf2be6 100644 --- a/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py +++ b/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py @@ -20,9 +20,7 @@ from __future__ import print_function from absl.testing import parameterized import numpy as np -from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops -from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import math_ops @@ -34,66 +32,62 @@ rng = np.random.RandomState(0) class AssertZeroImagPartTest(test.TestCase): - @test_util.run_deprecated_v1 def test_real_tensor_doesnt_raise(self): x = ops.convert_to_tensor([0., 2, 3]) - with self.cached_session(): - # Should not raise. - linear_operator_util.assert_zero_imag_part(x, message="ABC123").run() + # Should not raise. + self.evaluate( + linear_operator_util.assert_zero_imag_part(x, message="ABC123")) - @test_util.run_deprecated_v1 def test_complex_tensor_with_imag_zero_doesnt_raise(self): x = ops.convert_to_tensor([1., 0, 3]) y = ops.convert_to_tensor([0., 0, 0]) z = math_ops.complex(x, y) - with self.cached_session(): - # Should not raise. - linear_operator_util.assert_zero_imag_part(z, message="ABC123").run() + # Should not raise. 
+ self.evaluate( + linear_operator_util.assert_zero_imag_part(z, message="ABC123")) def test_complex_tensor_with_nonzero_imag_raises(self): x = ops.convert_to_tensor([1., 2, 0]) y = ops.convert_to_tensor([1., 2, 0]) z = math_ops.complex(x, y) - with self.cached_session(): - with self.assertRaisesOpError("ABC123"): - linear_operator_util.assert_zero_imag_part(z, message="ABC123").run() + with self.assertRaisesOpError("ABC123"): + self.evaluate( + linear_operator_util.assert_zero_imag_part(z, message="ABC123")) class AssertNoEntriesWithModulusZeroTest(test.TestCase): - @test_util.run_deprecated_v1 def test_nonzero_real_tensor_doesnt_raise(self): x = ops.convert_to_tensor([1., 2, 3]) - with self.cached_session(): - # Should not raise. - linear_operator_util.assert_no_entries_with_modulus_zero( - x, message="ABC123").run() + # Should not raise. + self.evaluate( + linear_operator_util.assert_no_entries_with_modulus_zero( + x, message="ABC123")) - @test_util.run_deprecated_v1 def test_nonzero_complex_tensor_doesnt_raise(self): x = ops.convert_to_tensor([1., 0, 3]) y = ops.convert_to_tensor([1., 2, 0]) z = math_ops.complex(x, y) - with self.cached_session(): - # Should not raise. - linear_operator_util.assert_no_entries_with_modulus_zero( - z, message="ABC123").run() + # Should not raise. + self.evaluate( + linear_operator_util.assert_no_entries_with_modulus_zero( + z, message="ABC123")) def test_zero_real_tensor_raises(self): x = ops.convert_to_tensor([1., 0, 3]) - with self.cached_session(): - with self.assertRaisesOpError("ABC123"): - linear_operator_util.assert_no_entries_with_modulus_zero( - x, message="ABC123").run() + with self.assertRaisesOpError("ABC123"): + self.evaluate( + linear_operator_util.assert_no_entries_with_modulus_zero( + x, message="ABC123")) def test_zero_complex_tensor_raises(self): x = ops.convert_to_tensor([1., 2, 0]) y = ops.convert_to_tensor([1., 2, 0]) z = math_ops.complex(x, y) - with self.cached_session(): - with self.assertRaisesOpError("ABC123"): - linear_operator_util.assert_no_entries_with_modulus_zero( - z, message="ABC123").run() + with self.assertRaisesOpError("ABC123"): + self.evaluate( + linear_operator_util.assert_no_entries_with_modulus_zero( + z, message="ABC123")) class BroadcastMatrixBatchDimsTest(test.TestCase): @@ -107,10 +101,8 @@ class BroadcastMatrixBatchDimsTest(test.TestCase): tensor, = linear_operator_util.broadcast_matrix_batch_dims([arr]) self.assertTrue(isinstance(tensor, ops.Tensor)) - with self.cached_session(): - self.assertAllClose(arr, self.evaluate(tensor)) + self.assertAllClose(arr, self.evaluate(tensor)) - @test_util.run_deprecated_v1 def test_static_dims_broadcast(self): # x.batch_shape = [3, 1, 2] # y.batch_shape = [4, 1] @@ -123,12 +115,11 @@ class BroadcastMatrixBatchDimsTest(test.TestCase): x_bc, y_bc = linear_operator_util.broadcast_matrix_batch_dims([x, y]) - with self.cached_session() as sess: - self.assertAllEqual(x_bc_expected.shape, x_bc.get_shape()) - self.assertAllEqual(y_bc_expected.shape, y_bc.get_shape()) - x_bc_, y_bc_ = self.evaluate([x_bc, y_bc]) - self.assertAllClose(x_bc_expected, x_bc_) - self.assertAllClose(y_bc_expected, y_bc_) + self.assertAllEqual(x_bc_expected.shape, x_bc.get_shape()) + self.assertAllEqual(y_bc_expected.shape, y_bc.get_shape()) + x_bc_, y_bc_ = self.evaluate([x_bc, y_bc]) + self.assertAllClose(x_bc_expected, x_bc_) + self.assertAllClose(y_bc_expected, y_bc_) def test_static_dims_broadcast_second_arg_higher_rank(self): # x.batch_shape = [1, 2] @@ -142,14 +133,12 @@ class 
BroadcastMatrixBatchDimsTest(test.TestCase): x_bc, y_bc = linear_operator_util.broadcast_matrix_batch_dims([x, y]) - with self.cached_session() as sess: - self.assertAllEqual(x_bc_expected.shape, x_bc.get_shape()) - self.assertAllEqual(y_bc_expected.shape, y_bc.get_shape()) - x_bc_, y_bc_ = self.evaluate([x_bc, y_bc]) - self.assertAllClose(x_bc_expected, x_bc_) - self.assertAllClose(y_bc_expected, y_bc_) + self.assertAllEqual(x_bc_expected.shape, x_bc.get_shape()) + self.assertAllEqual(y_bc_expected.shape, y_bc.get_shape()) + x_bc_, y_bc_ = self.evaluate([x_bc, y_bc]) + self.assertAllClose(x_bc_expected, x_bc_) + self.assertAllClose(y_bc_expected, y_bc_) - @test_util.run_deprecated_v1 def test_dynamic_dims_broadcast_32bit(self): # x.batch_shape = [3, 1, 2] # y.batch_shape = [4, 1] @@ -160,17 +149,15 @@ class BroadcastMatrixBatchDimsTest(test.TestCase): x_bc_expected = x + batch_of_zeros y_bc_expected = y + batch_of_zeros - x_ph = array_ops.placeholder(dtypes.float32) - y_ph = array_ops.placeholder(dtypes.float32) + x_ph = array_ops.placeholder_with_default(x, shape=None) + y_ph = array_ops.placeholder_with_default(y, shape=None) x_bc, y_bc = linear_operator_util.broadcast_matrix_batch_dims([x_ph, y_ph]) - with self.cached_session() as sess: - x_bc_, y_bc_ = sess.run([x_bc, y_bc], feed_dict={x_ph: x, y_ph: y}) - self.assertAllClose(x_bc_expected, x_bc_) - self.assertAllClose(y_bc_expected, y_bc_) + x_bc_, y_bc_ = self.evaluate([x_bc, y_bc]) + self.assertAllClose(x_bc_expected, x_bc_) + self.assertAllClose(y_bc_expected, y_bc_) - @test_util.run_deprecated_v1 def test_dynamic_dims_broadcast_32bit_second_arg_higher_rank(self): # x.batch_shape = [1, 2] # y.batch_shape = [3, 4, 1] @@ -181,15 +168,14 @@ class BroadcastMatrixBatchDimsTest(test.TestCase): x_bc_expected = x + batch_of_zeros y_bc_expected = y + batch_of_zeros - x_ph = array_ops.placeholder(dtypes.float32) - y_ph = array_ops.placeholder(dtypes.float32) + x_ph = array_ops.placeholder_with_default(x, shape=None) + y_ph = array_ops.placeholder_with_default(y, shape=None) x_bc, y_bc = linear_operator_util.broadcast_matrix_batch_dims([x_ph, y_ph]) - with self.cached_session() as sess: - x_bc_, y_bc_ = sess.run([x_bc, y_bc], feed_dict={x_ph: x, y_ph: y}) - self.assertAllClose(x_bc_expected, x_bc_) - self.assertAllClose(y_bc_expected, y_bc_) + x_bc_, y_bc_ = self.evaluate([x_bc, y_bc]) + self.assertAllClose(x_bc_expected, x_bc_) + self.assertAllClose(y_bc_expected, y_bc_) def test_less_than_two_dims_raises_static(self): x = rng.rand(3) @@ -204,20 +190,17 @@ class BroadcastMatrixBatchDimsTest(test.TestCase): class CholeskySolveWithBroadcastTest(test.TestCase): - @test_util.run_deprecated_v1 def test_static_dims_broadcast(self): # batch_shape = [2] chol = rng.rand(3, 3) rhs = rng.rand(2, 3, 7) chol_broadcast = chol + np.zeros((2, 1, 1)) - with self.cached_session(): - result = linear_operator_util.cholesky_solve_with_broadcast(chol, rhs) - self.assertAllEqual((2, 3, 7), result.get_shape()) - expected = linalg_ops.cholesky_solve(chol_broadcast, rhs) - self.assertAllClose(expected.eval(), self.evaluate(result)) + result = linear_operator_util.cholesky_solve_with_broadcast(chol, rhs) + self.assertAllEqual((2, 3, 7), result.get_shape()) + expected = linalg_ops.cholesky_solve(chol_broadcast, rhs) + self.assertAllClose(*self.evaluate([expected, result])) - @test_util.run_deprecated_v1 def test_dynamic_dims_broadcast_64bit(self): # batch_shape = [2, 2] chol = rng.rand(2, 3, 3) @@ -225,40 +208,29 @@ class CholeskySolveWithBroadcastTest(test.TestCase): 
chol_broadcast = chol + np.zeros((2, 2, 1, 1)) rhs_broadcast = rhs + np.zeros((2, 2, 1, 1)) - chol_ph = array_ops.placeholder(dtypes.float64) - rhs_ph = array_ops.placeholder(dtypes.float64) + chol_ph = array_ops.placeholder_with_default(chol, shape=None) + rhs_ph = array_ops.placeholder_with_default(rhs, shape=None) - with self.cached_session() as sess: - result, expected = sess.run( - [ - linear_operator_util.cholesky_solve_with_broadcast( - chol_ph, rhs_ph), - linalg_ops.cholesky_solve(chol_broadcast, rhs_broadcast) - ], - feed_dict={ - chol_ph: chol, - rhs_ph: rhs, - }) - self.assertAllClose(expected, result) + result, expected = self.evaluate([ + linear_operator_util.cholesky_solve_with_broadcast(chol_ph, rhs_ph), + linalg_ops.cholesky_solve(chol_broadcast, rhs_broadcast) + ]) + self.assertAllClose(expected, result) class MatrixSolveWithBroadcastTest(test.TestCase): - @test_util.run_deprecated_v1 def test_static_dims_broadcast_matrix_has_extra_dims(self): # batch_shape = [2] matrix = rng.rand(2, 3, 3) rhs = rng.rand(3, 7) rhs_broadcast = rhs + np.zeros((2, 1, 1)) - with self.cached_session(): - result = linear_operator_util.matrix_solve_with_broadcast( - matrix, rhs) - self.assertAllEqual((2, 3, 7), result.get_shape()) - expected = linalg_ops.matrix_solve(matrix, rhs_broadcast) - self.assertAllClose(expected.eval(), self.evaluate(result)) + result = linear_operator_util.matrix_solve_with_broadcast(matrix, rhs) + self.assertAllEqual((2, 3, 7), result.get_shape()) + expected = linalg_ops.matrix_solve(matrix, rhs_broadcast) + self.assertAllClose(*self.evaluate([expected, result])) - @test_util.run_deprecated_v1 def test_static_dims_broadcast_rhs_has_extra_dims(self): # Since the second arg has extra dims, and the domain dim of the first arg # is larger than the number of linear equations, code will "flip" the extra @@ -271,13 +243,11 @@ class MatrixSolveWithBroadcastTest(test.TestCase): rhs = rng.rand(2, 3, 2) matrix_broadcast = matrix + np.zeros((2, 1, 1)) - with self.cached_session(): - result = linear_operator_util.matrix_solve_with_broadcast(matrix, rhs) - self.assertAllEqual((2, 3, 2), result.get_shape()) - expected = linalg_ops.matrix_solve(matrix_broadcast, rhs) - self.assertAllClose(expected.eval(), self.evaluate(result)) + result = linear_operator_util.matrix_solve_with_broadcast(matrix, rhs) + self.assertAllEqual((2, 3, 2), result.get_shape()) + expected = linalg_ops.matrix_solve(matrix_broadcast, rhs) + self.assertAllClose(*self.evaluate([expected, result])) - @test_util.run_deprecated_v1 def test_static_dims_broadcast_rhs_has_extra_dims_dynamic(self): # Since the second arg has extra dims, and the domain dim of the first arg # is larger than the number of linear equations, code will "flip" the extra @@ -290,22 +260,14 @@ class MatrixSolveWithBroadcastTest(test.TestCase): rhs = rng.rand(2, 3, 2) matrix_broadcast = matrix + np.zeros((2, 1, 1)) - matrix_ph = array_ops.placeholder(dtypes.float64, shape=[None, None]) - rhs_ph = array_ops.placeholder(dtypes.float64, shape=[None, None, None]) + matrix_ph = array_ops.placeholder_with_default(matrix, shape=[None, None]) + rhs_ph = array_ops.placeholder_with_default(rhs, shape=[None, None, None]) - with self.cached_session(): - result = linear_operator_util.matrix_solve_with_broadcast(matrix_ph, - rhs_ph) - self.assertAllEqual(3, result.shape.ndims) - expected = linalg_ops.matrix_solve(matrix_broadcast, rhs) - self.assertAllClose( - self.evaluate(expected), - result.eval(feed_dict={ - matrix_ph: matrix, - rhs_ph: rhs - })) + result = 
linear_operator_util.matrix_solve_with_broadcast(matrix_ph, rhs_ph) + self.assertAllEqual(3, result.shape.ndims) + expected = linalg_ops.matrix_solve(matrix_broadcast, rhs) + self.assertAllClose(*self.evaluate([expected, result])) - @test_util.run_deprecated_v1 def test_static_dims_broadcast_rhs_has_extra_dims_and_adjoint(self): # Since the second arg has extra dims, and the domain dim of the first arg # is larger than the number of linear equations, code will "flip" the extra @@ -318,14 +280,12 @@ class MatrixSolveWithBroadcastTest(test.TestCase): rhs = rng.rand(2, 3, 2) matrix_broadcast = matrix + np.zeros((2, 1, 1)) - with self.cached_session(): - result = linear_operator_util.matrix_solve_with_broadcast( - matrix, rhs, adjoint=True) - self.assertAllEqual((2, 3, 2), result.get_shape()) - expected = linalg_ops.matrix_solve(matrix_broadcast, rhs, adjoint=True) - self.assertAllClose(expected.eval(), self.evaluate(result)) + result = linear_operator_util.matrix_solve_with_broadcast( + matrix, rhs, adjoint=True) + self.assertAllEqual((2, 3, 2), result.get_shape()) + expected = linalg_ops.matrix_solve(matrix_broadcast, rhs, adjoint=True) + self.assertAllClose(*self.evaluate([expected, result])) - @test_util.run_deprecated_v1 def test_dynamic_dims_broadcast_64bit(self): # batch_shape = [2, 2] matrix = rng.rand(2, 3, 3) @@ -333,40 +293,30 @@ class MatrixSolveWithBroadcastTest(test.TestCase): matrix_broadcast = matrix + np.zeros((2, 2, 1, 1)) rhs_broadcast = rhs + np.zeros((2, 2, 1, 1)) - matrix_ph = array_ops.placeholder(dtypes.float64) - rhs_ph = array_ops.placeholder(dtypes.float64) + matrix_ph = array_ops.placeholder_with_default(matrix, shape=None) + rhs_ph = array_ops.placeholder_with_default(rhs, shape=None) - with self.cached_session() as sess: - result, expected = sess.run( - [ - linear_operator_util.matrix_solve_with_broadcast( - matrix_ph, rhs_ph), - linalg_ops.matrix_solve(matrix_broadcast, rhs_broadcast) - ], - feed_dict={ - matrix_ph: matrix, - rhs_ph: rhs, - }) - self.assertAllClose(expected, result) + result, expected = self.evaluate([ + linear_operator_util.matrix_solve_with_broadcast(matrix_ph, rhs_ph), + linalg_ops.matrix_solve(matrix_broadcast, rhs_broadcast) + ]) + self.assertAllClose(expected, result) class MatrixTriangularSolveWithBroadcastTest(test.TestCase): - @test_util.run_deprecated_v1 def test_static_dims_broadcast_matrix_has_extra_dims(self): # batch_shape = [2] matrix = rng.rand(2, 3, 3) rhs = rng.rand(3, 7) rhs_broadcast = rhs + np.zeros((2, 1, 1)) - with self.cached_session(): - result = linear_operator_util.matrix_triangular_solve_with_broadcast( - matrix, rhs) - self.assertAllEqual((2, 3, 7), result.get_shape()) - expected = linalg_ops.matrix_triangular_solve(matrix, rhs_broadcast) - self.assertAllClose(expected.eval(), self.evaluate(result)) + result = linear_operator_util.matrix_triangular_solve_with_broadcast( + matrix, rhs) + self.assertAllEqual((2, 3, 7), result.get_shape()) + expected = linalg_ops.matrix_triangular_solve(matrix, rhs_broadcast) + self.assertAllClose(*self.evaluate([expected, result])) - @test_util.run_deprecated_v1 def test_static_dims_broadcast_rhs_has_extra_dims(self): # Since the second arg has extra dims, and the domain dim of the first arg # is larger than the number of linear equations, code will "flip" the extra @@ -379,14 +329,12 @@ class MatrixTriangularSolveWithBroadcastTest(test.TestCase): rhs = rng.rand(2, 3, 2) matrix_broadcast = matrix + np.zeros((2, 1, 1)) - with self.cached_session(): - result = 
linear_operator_util.matrix_triangular_solve_with_broadcast( - matrix, rhs) - self.assertAllEqual((2, 3, 2), result.get_shape()) - expected = linalg_ops.matrix_triangular_solve(matrix_broadcast, rhs) - self.assertAllClose(expected.eval(), self.evaluate(result)) + result = linear_operator_util.matrix_triangular_solve_with_broadcast( + matrix, rhs) + self.assertAllEqual((2, 3, 2), result.get_shape()) + expected = linalg_ops.matrix_triangular_solve(matrix_broadcast, rhs) + self.assertAllClose(*self.evaluate([expected, result])) - @test_util.run_deprecated_v1 def test_static_dims_broadcast_rhs_has_extra_dims_and_adjoint(self): # Since the second arg has extra dims, and the domain dim of the first arg # is larger than the number of linear equations, code will "flip" the extra @@ -399,36 +347,28 @@ class MatrixTriangularSolveWithBroadcastTest(test.TestCase): rhs = rng.rand(2, 3, 2) matrix_broadcast = matrix + np.zeros((2, 1, 1)) - with self.cached_session(): - result = linear_operator_util.matrix_triangular_solve_with_broadcast( - matrix, rhs, adjoint=True) - self.assertAllEqual((2, 3, 2), result.get_shape()) - expected = linalg_ops.matrix_triangular_solve( - matrix_broadcast, rhs, adjoint=True) - self.assertAllClose(expected.eval(), self.evaluate(result)) + result = linear_operator_util.matrix_triangular_solve_with_broadcast( + matrix, rhs, adjoint=True) + self.assertAllEqual((2, 3, 2), result.get_shape()) + expected = linalg_ops.matrix_triangular_solve( + matrix_broadcast, rhs, adjoint=True) + self.assertAllClose(*self.evaluate([expected, result])) - @test_util.run_deprecated_v1 def test_dynamic_dims_broadcast_64bit(self): # batch_shape = [2] matrix = rng.rand(2, 3, 3) rhs = rng.rand(3, 7) rhs_broadcast = rhs + np.zeros((2, 1, 1)) - matrix_ph = array_ops.placeholder(dtypes.float64) - rhs_ph = array_ops.placeholder(dtypes.float64) + matrix_ph = array_ops.placeholder_with_default(matrix, shape=None) + rhs_ph = array_ops.placeholder_with_default(rhs, shape=None) - with self.cached_session() as sess: - result, expected = sess.run( - [ - linear_operator_util.matrix_triangular_solve_with_broadcast( - matrix_ph, rhs_ph), - linalg_ops.matrix_triangular_solve(matrix, rhs_broadcast) - ], - feed_dict={ - matrix_ph: matrix, - rhs_ph: rhs, - }) - self.assertAllClose(expected, result) + result, expected = self.evaluate([ + linear_operator_util.matrix_triangular_solve_with_broadcast( + matrix_ph, rhs_ph), + linalg_ops.matrix_triangular_solve(matrix, rhs_broadcast) + ]) + self.assertAllClose(expected, result) class DomainDimensionStubOperator(object): @@ -442,22 +382,21 @@ class DomainDimensionStubOperator(object): class AssertCompatibleMatrixDimensionsTest(test.TestCase): - @test_util.run_deprecated_v1 def test_compatible_dimensions_do_not_raise(self): - with self.cached_session(): - x = ops.convert_to_tensor(rng.rand(2, 3, 4)) - operator = DomainDimensionStubOperator(3) - # Should not raise - linear_operator_util.assert_compatible_matrix_dimensions( - operator, x).run() # pyformat: disable + x = ops.convert_to_tensor(rng.rand(2, 3, 4)) + operator = DomainDimensionStubOperator(3) + # Should not raise + self.evaluate( + linear_operator_util.assert_compatible_matrix_dimensions(operator, x)) def test_incompatible_dimensions_raise(self): - with self.cached_session(): - x = ops.convert_to_tensor(rng.rand(2, 4, 4)) - operator = DomainDimensionStubOperator(3) - with self.assertRaisesOpError("Incompatible matrix dimensions"): - linear_operator_util.assert_compatible_matrix_dimensions( - operator, x).run() # 
pyformat: disable + x = ops.convert_to_tensor(rng.rand(2, 4, 4)) + operator = DomainDimensionStubOperator(3) + # pylint: disable=g-error-prone-assert-raises + with self.assertRaisesOpError("Dimensions are not compatible"): + self.evaluate( + linear_operator_util.assert_compatible_matrix_dimensions(operator, x)) + # pylint: enable=g-error-prone-assert-raises class DummyOperatorWithHint(object): diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py index 60f9c4820e4..49bbc69149a 100644 --- a/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py +++ b/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py @@ -22,6 +22,7 @@ import numpy as np from tensorflow.python.framework import dtypes from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops +from tensorflow.python.ops import variables as variables_module from tensorflow.python.ops.linalg import linalg as linalg_lib from tensorflow.python.ops.linalg import linear_operator_test_util from tensorflow.python.platform import test @@ -30,6 +31,7 @@ from tensorflow.python.platform import test rng = np.random.RandomState(2016) +@test_util.run_all_in_graph_and_eager_modes class LinearOperatorZerosTest( linear_operator_test_util.SquareLinearOperatorDerivedClassTest): """Most tests done in the base class LinearOperatorDerivedClassTest.""" @@ -75,11 +77,10 @@ class LinearOperatorZerosTest( operator = linalg_lib.LinearOperatorZeros(num_rows=2) operator.assert_non_singular() - @test_util.run_deprecated_v1 def test_assert_self_adjoint(self): with self.cached_session(): operator = linalg_lib.LinearOperatorZeros(num_rows=2) - operator.assert_self_adjoint().run() # Should not fail + self.evaluate(operator.assert_self_adjoint()) # Should not fail def test_non_scalar_num_rows_raises_static(self): with self.assertRaisesRegexp(ValueError, "must be a 0-D Tensor"): @@ -111,46 +112,37 @@ class LinearOperatorZerosTest( with self.assertRaisesRegexp(ValueError, "must be non-negative"): linalg_lib.LinearOperatorZeros(num_rows=2, batch_shape=[-2]) - @test_util.run_deprecated_v1 def test_non_scalar_num_rows_raises_dynamic(self): with self.cached_session(): - num_rows = array_ops.placeholder(dtypes.int32) - operator = linalg_lib.LinearOperatorZeros( - num_rows, assert_proper_shapes=True) - with self.assertRaisesOpError("must be a 0-D Tensor"): - operator.to_dense().eval(feed_dict={num_rows: [2]}) + num_rows = array_ops.placeholder_with_default([2], shape=None) + with self.assertRaisesError("must be a 0-D Tensor"): + operator = linalg_lib.LinearOperatorZeros( + num_rows, assert_proper_shapes=True) + self.evaluate(operator.to_dense()) - @test_util.run_deprecated_v1 def test_negative_num_rows_raises_dynamic(self): with self.cached_session(): - n = array_ops.placeholder(dtypes.int32) - operator = linalg_lib.LinearOperatorZeros( - num_rows=n, assert_proper_shapes=True) - with self.assertRaisesOpError("must be non-negative"): - operator.to_dense().eval(feed_dict={n: -2}) + n = array_ops.placeholder_with_default(-2, shape=None) + with self.assertRaisesError("must be non-negative"): + operator = linalg_lib.LinearOperatorZeros( + num_rows=n, assert_proper_shapes=True) + self.evaluate(operator.to_dense()) - operator = linalg_lib.LinearOperatorZeros( - num_rows=2, num_columns=n, assert_proper_shapes=True) - with self.assertRaisesOpError("must be non-negative"): - operator.to_dense().eval(feed_dict={n: -2}) - - 
@test_util.run_deprecated_v1 def test_non_1d_batch_shape_raises_dynamic(self): with self.cached_session(): - batch_shape = array_ops.placeholder(dtypes.int32) - operator = linalg_lib.LinearOperatorZeros( - num_rows=2, batch_shape=batch_shape, assert_proper_shapes=True) - with self.assertRaisesOpError("must be a 1-D"): - operator.to_dense().eval(feed_dict={batch_shape: 2}) + batch_shape = array_ops.placeholder_with_default(2, shape=None) + with self.assertRaisesError("must be a 1-D"): + operator = linalg_lib.LinearOperatorZeros( + num_rows=2, batch_shape=batch_shape, assert_proper_shapes=True) + self.evaluate(operator.to_dense()) - @test_util.run_deprecated_v1 def test_negative_batch_shape_raises_dynamic(self): with self.cached_session(): - batch_shape = array_ops.placeholder(dtypes.int32) - operator = linalg_lib.LinearOperatorZeros( - num_rows=2, batch_shape=batch_shape, assert_proper_shapes=True) - with self.assertRaisesOpError("must be non-negative"): - operator.to_dense().eval(feed_dict={batch_shape: [-2]}) + batch_shape = array_ops.placeholder_with_default([-2], shape=None) + with self.assertRaisesError("must be non-negative"): + operator = linalg_lib.LinearOperatorZeros( + num_rows=2, batch_shape=batch_shape, assert_proper_shapes=True) + self.evaluate(operator.to_dense()) def test_wrong_matrix_dimensions_raises_static(self): operator = linalg_lib.LinearOperatorZeros(num_rows=2) @@ -158,17 +150,15 @@ class LinearOperatorZerosTest( with self.assertRaisesRegexp(ValueError, "Dimensions.*not compatible"): operator.matmul(x) - @test_util.run_deprecated_v1 def test_wrong_matrix_dimensions_raises_dynamic(self): - num_rows = array_ops.placeholder(dtypes.int32) - x = array_ops.placeholder(dtypes.float32) + num_rows = array_ops.placeholder_with_default(2, shape=None) + x = array_ops.placeholder_with_default(rng.rand(3, 3), shape=None) with self.cached_session(): - operator = linalg_lib.LinearOperatorZeros( - num_rows, assert_proper_shapes=True) - y = operator.matmul(x) - with self.assertRaisesOpError("Incompatible.*dimensions"): - y.eval(feed_dict={num_rows: 2, x: rng.rand(3, 3)}) + with self.assertRaisesError("Dimensions.*not.compatible"): + operator = linalg_lib.LinearOperatorZeros( + num_rows, assert_proper_shapes=True, dtype=dtypes.float64) + self.evaluate(operator.matmul(x)) def test_is_x_flags(self): # The is_x flags are by default all True. 
@@ -188,7 +178,20 @@ class LinearOperatorZerosTest( operator2.matmul(operator1), linalg_lib.LinearOperatorZeros)) + def test_ref_type_shape_args_raises(self): + with self.assertRaisesRegexp(TypeError, "num_rows.cannot.be.reference"): + linalg_lib.LinearOperatorZeros(num_rows=variables_module.Variable(2)) + with self.assertRaisesRegexp(TypeError, "num_columns.cannot.be.reference"): + linalg_lib.LinearOperatorZeros( + num_rows=2, num_columns=variables_module.Variable(3)) + + with self.assertRaisesRegexp(TypeError, "batch_shape.cannot.be.reference"): + linalg_lib.LinearOperatorZeros( + num_rows=2, batch_shape=variables_module.Variable([2])) + + +@test_util.run_all_in_graph_and_eager_modes class LinearOperatorZerosNotSquareTest( linear_operator_test_util.NonSquareLinearOperatorDerivedClassTest): diff --git a/tensorflow/python/ops/linalg/linear_operator_householder.py b/tensorflow/python/ops/linalg/linear_operator_householder.py index be8f05bbff1..305ef4f51d8 100644 --- a/tensorflow/python/ops/linalg/linear_operator_householder.py +++ b/tensorflow/python/ops/linalg/linear_operator_householder.py @@ -25,6 +25,7 @@ from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops.linalg import linalg_impl as linalg from tensorflow.python.ops.linalg import linear_operator +from tensorflow.python.ops.linalg import linear_operator_util from tensorflow.python.util.tf_export import tf_export __all__ = ["LinearOperatorHouseholder",] @@ -123,7 +124,7 @@ class LinearOperatorHouseholder(linear_operator.LinearOperator): """ with ops.name_scope(name, values=[reflection_axis]): - self._reflection_axis = ops.convert_to_tensor( + self._reflection_axis = linear_operator_util.convert_nonref_to_tensor( reflection_axis, name="reflection_axis") self._check_reflection_axis(self._reflection_axis) @@ -194,9 +195,10 @@ class LinearOperatorHouseholder(linear_operator.LinearOperator): # Note that because this is a reflection, it lies in O(n) (for real vector # spaces) or U(n) (for complex vector spaces), and thus is its own adjoint. + reflection_axis = ops.convert_to_tensor(self.reflection_axis) x = linalg.adjoint(x) if adjoint_arg else x - normalized_axis = self.reflection_axis / linalg.norm( - self.reflection_axis, axis=-1, keepdims=True) + normalized_axis = reflection_axis / linalg.norm( + reflection_axis, axis=-1, keepdims=True) mat = normalized_axis[..., array_ops.newaxis] x_dot_normalized_v = math_ops.matmul(mat, x, adjoint_a=True) diff --git a/tensorflow/python/ops/linalg/linear_operator_identity.py b/tensorflow/python/ops/linalg/linear_operator_identity.py index 1b019158023..f3c762a9686 100644 --- a/tensorflow/python/ops/linalg/linear_operator_identity.py +++ b/tensorflow/python/ops/linalg/linear_operator_identity.py @@ -250,6 +250,7 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity): negative. ValueError: If any of the following is not `True`: `{is_self_adjoint, is_non_singular, is_positive_definite}`. + TypeError: If `num_rows` or `batch_shape` is ref-type (e.g. Variable). 
""" dtype = dtype or dtypes.float32 self._assert_proper_shapes = assert_proper_shapes @@ -273,6 +274,9 @@ class LinearOperatorIdentity(BaseLinearOperatorIdentity): is_square=is_square, name=name) + linear_operator_util.assert_not_ref_type(num_rows, "num_rows") + linear_operator_util.assert_not_ref_type(batch_shape, "batch_shape") + self._num_rows = linear_operator_util.shape_tensor( num_rows, name="num_rows") self._num_rows_static = tensor_util.constant_value(self._num_rows) @@ -589,7 +593,8 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity): self._assert_proper_shapes = assert_proper_shapes with ops.name_scope(name, values=[multiplier, num_rows]): - self._multiplier = ops.convert_to_tensor(multiplier, name="multiplier") + self._multiplier = linear_operator_util.convert_nonref_to_tensor( + multiplier, name="multiplier") # Check and auto-set hints. if not self._multiplier.dtype.is_complex: @@ -601,20 +606,16 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity): if not is_square: raise ValueError("A ScaledIdentity operator is always square.") + linear_operator_util.assert_not_ref_type(num_rows, "num_rows") + super(LinearOperatorScaledIdentity, self).__init__( - dtype=self._multiplier.dtype, + dtype=self._multiplier.dtype.base_dtype, is_non_singular=is_non_singular, is_self_adjoint=is_self_adjoint, is_positive_definite=is_positive_definite, is_square=is_square, name=name) - # Shape [B1,...Bb, 1, 1] - self._multiplier_matrix = array_ops.expand_dims( - array_ops.expand_dims(self.multiplier, -1), -1) - self._multiplier_matrix_conj = math_ops.conj(self._multiplier_matrix) - self._abs_multiplier = math_ops.abs(self.multiplier) - self._num_rows = linear_operator_util.shape_tensor( num_rows, name="num_rows") self._num_rows_static = tensor_util.constant_value(self._num_rows) @@ -652,34 +653,34 @@ class LinearOperatorScaledIdentity(BaseLinearOperatorIdentity): imag_multiplier, message="LinearOperator was not self-adjoint") + def _make_multiplier_matrix(self, conjugate=False): + # Shape [B1,...Bb, 1, 1] + multiplier_matrix = array_ops.expand_dims( + array_ops.expand_dims(self.multiplier, -1), -1) + if conjugate: + multiplier_matrix = math_ops.conj(multiplier_matrix) + return multiplier_matrix + def _matmul(self, x, adjoint=False, adjoint_arg=False): x = linalg.adjoint(x) if adjoint_arg else x - if adjoint: - matrix = self._multiplier_matrix_conj - else: - matrix = self._multiplier_matrix if self._assert_proper_shapes: aps = linear_operator_util.assert_compatible_matrix_dimensions(self, x) x = control_flow_ops.with_dependencies([aps], x) - return x * matrix + return x * self._make_multiplier_matrix(conjugate=adjoint) def _determinant(self): return self.multiplier**self._num_rows_cast_to_dtype def _log_abs_determinant(self): return self._num_rows_cast_to_real_dtype * math_ops.log( - self._abs_multiplier) + math_ops.abs(self.multiplier)) def _solve(self, rhs, adjoint=False, adjoint_arg=False): rhs = linalg.adjoint(rhs) if adjoint_arg else rhs - if adjoint: - matrix = self._multiplier_matrix_conj - else: - matrix = self._multiplier_matrix if self._assert_proper_shapes: aps = linear_operator_util.assert_compatible_matrix_dimensions(self, rhs) rhs = control_flow_ops.with_dependencies([aps], rhs) - return rhs / matrix + return rhs / self._make_multiplier_matrix(conjugate=adjoint) def _trace(self): # Get Tensor of all ones of same shape as self.batch_shape. 
diff --git a/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py b/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py index cc2e1baf2e9..e18a1184455 100644 --- a/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py +++ b/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py @@ -18,7 +18,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops @@ -145,10 +144,9 @@ class LinearOperatorLowerTriangular(linear_operator.LinearOperator): is_square = True with ops.name_scope(name, values=[tril]): - self._tril = ops.convert_to_tensor(tril, name="tril") + self._tril = linear_operator_util.convert_nonref_to_tensor(tril, + name="tril") self._check_tril(self._tril) - self._tril = array_ops.matrix_band_part(tril, -1, 0) - self._diag = array_ops.matrix_diag_part(self._tril) super(LinearOperatorLowerTriangular, self).__init__( dtype=self._tril.dtype, @@ -161,24 +159,20 @@ class LinearOperatorLowerTriangular(linear_operator.LinearOperator): def _check_tril(self, tril): """Static check of the `tril` argument.""" - allowed_dtypes = [ - dtypes.float16, - dtypes.float32, - dtypes.float64, - dtypes.complex64, - dtypes.complex128, - ] - dtype = tril.dtype - if dtype not in allowed_dtypes: - raise TypeError( - "Argument tril must have dtype in %s. Found: %s" - % (allowed_dtypes, dtype)) if tril.get_shape().ndims is not None and tril.get_shape().ndims < 2: raise ValueError( "Argument tril must have at least 2 dimensions. Found: %s" % tril) + def _get_tril(self): + """Gets the `tril` kwarg, with upper part zero-d out.""" + return array_ops.matrix_band_part(self._tril, -1, 0) + + def _get_diag(self): + """Gets the diagonal part of `tril` kwarg.""" + return array_ops.matrix_diag_part(self._tril) + def _shape(self): return self._tril.get_shape() @@ -187,27 +181,24 @@ class LinearOperatorLowerTriangular(linear_operator.LinearOperator): def _assert_non_singular(self): return linear_operator_util.assert_no_entries_with_modulus_zero( - self._diag, + self._get_diag(), message="Singular operator: Diagonal contained zero values.") def _matmul(self, x, adjoint=False, adjoint_arg=False): return math_ops.matmul( - self._tril, x, adjoint_a=adjoint, adjoint_b=adjoint_arg) + self._get_tril(), x, adjoint_a=adjoint, adjoint_b=adjoint_arg) def _determinant(self): - return math_ops.reduce_prod(self._diag, axis=[-1]) + return math_ops.reduce_prod(self._get_diag(), axis=[-1]) def _log_abs_determinant(self): return math_ops.reduce_sum( - math_ops.log(math_ops.abs(self._diag)), axis=[-1]) + math_ops.log(math_ops.abs(self._get_diag())), axis=[-1]) def _solve(self, rhs, adjoint=False, adjoint_arg=False): rhs = linalg.adjoint(rhs) if adjoint_arg else rhs return linear_operator_util.matrix_triangular_solve_with_broadcast( - self._tril, rhs, lower=True, adjoint=adjoint) + self._get_tril(), rhs, lower=True, adjoint=adjoint) def _to_dense(self): - return self._tril - - def _add_to_tensor(self, x): - return self._tril + x + return self._get_tril() diff --git a/tensorflow/python/ops/linalg/linear_operator_test_util.py b/tensorflow/python/ops/linalg/linear_operator_test_util.py index 12cdb1178f6..3d1e1fc2e24 100644 --- a/tensorflow/python/ops/linalg/linear_operator_test_util.py +++ b/tensorflow/python/ops/linalg/linear_operator_test_util.py @@ -24,6 +24,7 @@ import 
numpy as np import six from tensorflow.python.eager import backprop +from tensorflow.python.eager import context from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import random_seed @@ -51,6 +52,15 @@ class OperatorShapesInfo(object): self.__dict__.update(kwargs) +class CheckTapeSafeSkipOptions(object): + + # Skip checking this particular method. + DETERMINANT = "determinant" + DIAG_PART = "diag_part" + LOG_ABS_DETERMINANT = "log_abs_determinant" + TRACE = "trace" + + @six.add_metaclass(abc.ABCMeta) # pylint: disable=no-init class LinearOperatorDerivedClassTest(test.TestCase): """Tests for derived classes. @@ -174,18 +184,35 @@ class LinearOperatorDerivedClassTest(test.TestCase): # To skip "test_foo", add "foo" to this list. return [] - def check_tape_safe(self, operator): - """Check gradients are not None w.r.t. Variables. + def assertRaisesError(self, msg): + """assertRaisesRegexp or OpError, depending on context.executing_eagerly.""" + if context.executing_eagerly(): + return self.assertRaisesRegexp(Exception, msg) + return self.assertRaisesOpError(msg) + + def check_tape_safe(self, operator, skip_options=None): + """Check gradients are not None w.r.t. operator.variables. Meant to be called from the derived class. + This ensures grads are not w.r.t every variable in operator.variables. If + more fine-grained testing is needed, a custom test should be written. + Args: operator: LinearOperator. Exact checks done will depend on hints. + skip_options: Optional list of CheckTapeSafeSkipOptions. + Makes this test skip particular checks. """ + skip_options = skip_options or [] + + if not operator.variables: + raise AssertionError("`operator.variables` was empty") + def _assert_not_none(iterable): for item in iterable: self.assertIsNotNone(item) + # Tape tests that can be run on every operator below. with backprop.GradientTape() as tape: _assert_not_none(tape.gradient(operator.to_dense(), operator.variables)) @@ -193,23 +220,30 @@ class LinearOperatorDerivedClassTest(test.TestCase): _assert_not_none( tape.gradient(operator.adjoint().to_dense(), operator.variables)) - x = array_ops.ones(shape=operator.H.shape_tensor()[:-1]) + x = math_ops.cast( + array_ops.ones(shape=operator.H.shape_tensor()[:-1]), operator.dtype) with backprop.GradientTape() as tape: _assert_not_none(tape.gradient(operator.matvec(x), operator.variables)) + # Tests for square, but possibly non-singular operators below. if not operator.is_square: return - with backprop.GradientTape() as tape: - _assert_not_none( - tape.gradient(operator.determinant(), operator.variables)) + for option in [ + CheckTapeSafeSkipOptions.DETERMINANT, + CheckTapeSafeSkipOptions.LOG_ABS_DETERMINANT, + CheckTapeSafeSkipOptions.DIAG_PART, + CheckTapeSafeSkipOptions.TRACE, + ]: + with backprop.GradientTape() as tape: + if option not in skip_options: + _assert_not_none( + tape.gradient(getattr(operator, option)(), operator.variables)) - with backprop.GradientTape() as tape: - _assert_not_none(tape.gradient(operator.diag_part(), operator.variables)) - - with backprop.GradientTape() as tape: - _assert_not_none(tape.gradient(operator.trace(), operator.variables)) + # Tests for non-singular operators below. 
+ if operator.is_non_singular is False: # pylint: disable=g-bool-id-comparison + return with backprop.GradientTape() as tape: _assert_not_none( @@ -218,6 +252,7 @@ class LinearOperatorDerivedClassTest(test.TestCase): with backprop.GradientTape() as tape: _assert_not_none(tape.gradient(operator.solvevec(x), operator.variables)) + # Tests for SPD operators below. if not (operator.is_self_adjoint and operator.is_positive_definite): return diff --git a/tensorflow/python/ops/linalg/linear_operator_util.py b/tensorflow/python/ops/linalg/linear_operator_util.py index 573d373ea93..3a27103bff7 100644 --- a/tensorflow/python/ops/linalg/linear_operator_util.py +++ b/tensorflow/python/ops/linalg/linear_operator_util.py @@ -157,6 +157,12 @@ def is_ref(x): hasattr(x, "shape"))) +def assert_not_ref_type(x, arg_name): + if is_ref(x): + raise TypeError( + "Argument %s cannot be reference type. Found: %s" % (arg_name, type(x))) + + ################################################################################ # Asserts. ################################################################################ @@ -223,7 +229,9 @@ def assert_compatible_matrix_dimensions(operator, x): assert_same_dd = check_ops.assert_equal( array_ops.shape(x)[-2], operator.domain_dimension_tensor(), - message=("Incompatible matrix dimensions. " + # This error message made to look similar to error raised by static check + # in the base class. + message=("Dimensions are not compatible. " "shape[-2] of argument to be the same as this operator")) return assert_same_dd diff --git a/tensorflow/python/ops/linalg/linear_operator_zeros.py b/tensorflow/python/ops/linalg/linear_operator_zeros.py index b8a79c065b3..619fe4b8f71 100644 --- a/tensorflow/python/ops/linalg/linear_operator_zeros.py +++ b/tensorflow/python/ops/linalg/linear_operator_zeros.py @@ -196,6 +196,10 @@ class LinearOperatorZeros(linear_operator.LinearOperator): is_square=is_square, name=name) + linear_operator_util.assert_not_ref_type(num_rows, "num_rows") + linear_operator_util.assert_not_ref_type(num_columns, "num_columns") + linear_operator_util.assert_not_ref_type(batch_shape, "batch_shape") + self._num_rows = linear_operator_util.shape_tensor( num_rows, name="num_rows") self._num_rows_static = tensor_util.constant_value(self._num_rows) From c2ba0b595a21231c72af2f2bf6ae23bdb8c9c15c Mon Sep 17 00:00:00 2001 From: jiakai Date: Tue, 18 Jun 2019 00:50:37 +0800 Subject: [PATCH 0371/3053] Reuse DeviceNameUtils::LocalName Change-Id: Ie1c644231b8c4154c1599ffd9630aa8f2785f07c --- tensorflow/core/distributed_runtime/BUILD | 1 + .../core/distributed_runtime/remote_device.cc | 14 ++------------ 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD index ef791c74d52..b33b785a600 100644 --- a/tensorflow/core/distributed_runtime/BUILD +++ b/tensorflow/core/distributed_runtime/BUILD @@ -286,6 +286,7 @@ cc_library( ":worker_cache", ":worker_interface", "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:worker_proto_cc", ], diff --git a/tensorflow/core/distributed_runtime/remote_device.cc b/tensorflow/core/distributed_runtime/remote_device.cc index a4b19cbf157..346e772b3b8 100644 --- a/tensorflow/core/distributed_runtime/remote_device.cc +++ b/tensorflow/core/distributed_runtime/remote_device.cc @@ -26,24 +26,14 @@ limitations under the License. 
#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/protobuf/worker.pb.h" +#include "tensorflow/core/util/device_name_utils.h" namespace tensorflow { -// TODO(zhifengc): We need to consolidate (full/partial) device name -// parsing into one place. -// -// Parses and returns the local device part (e.g., cpu:0, gpu:4). -string GetLocalDeviceName(StringPiece fullname) { - auto pos = fullname.rfind('/'); - CHECK_NE(pos, StringPiece::npos); - fullname.remove_prefix(pos + 1); - return string(fullname); -} - class RemoteDevice : public Device { public: RemoteDevice(Env* env, const DeviceAttributes& da) - : Device(env, da), local_dev_name_(GetLocalDeviceName(da.name())) {} + : Device(env, da), local_dev_name_(DeviceNameUtils::LocalName(da.name())) {} Status Sync() override { return Status::OK(); } Allocator* GetAllocator(AllocatorAttributes attr) override { return nullptr; } From 4f910ac64bc80e430ca2c936de88f107e098cf4e Mon Sep 17 00:00:00 2001 From: Chao Mei Date: Mon, 22 Jul 2019 19:29:01 -0700 Subject: [PATCH 0372/3053] Make TfLiteInternalBackendContext as a interface-only abstract class. PiperOrigin-RevId: 259455436 --- .../lite/external_cpu_backend_context.cc | 2 +- .../lite/external_cpu_backend_context.h | 20 +++----------- .../lite/kernels/cpu_backend_context.cc | 4 +-- tensorflow/lite/kernels/cpu_backend_context.h | 27 +++++++++++-------- .../lite/kernels/cpu_backend_gemm_test.cc | 2 +- .../lite/kernels/cpu_backend_support.cc | 3 +-- .../kernels/cpu_backend_threadpool_test.cc | 4 +-- .../internal/depthwiseconv_quantized_test.cc | 2 +- 8 files changed, 27 insertions(+), 37 deletions(-) diff --git a/tensorflow/lite/external_cpu_backend_context.cc b/tensorflow/lite/external_cpu_backend_context.cc index 2be35c8baf7..df1fc01b8b9 100644 --- a/tensorflow/lite/external_cpu_backend_context.cc +++ b/tensorflow/lite/external_cpu_backend_context.cc @@ -22,7 +22,7 @@ TfLiteStatus RefreshExternalCpuBackendContext(TfLiteContext* context) { context->GetExternalContext(context, kTfLiteCpuBackendContext)); if (external_context && external_context->internal_backend_context() && context->recommended_num_threads != -1) { - external_context->internal_backend_context()->set_max_num_threads( + external_context->internal_backend_context()->SetMaxNumThreads( context->recommended_num_threads); } return kTfLiteOk; diff --git a/tensorflow/lite/external_cpu_backend_context.h b/tensorflow/lite/external_cpu_backend_context.h index 0d8763532c7..8d5125dec1f 100644 --- a/tensorflow/lite/external_cpu_backend_context.h +++ b/tensorflow/lite/external_cpu_backend_context.h @@ -27,27 +27,13 @@ namespace tflite { // generally a collection of utilities (i.e. a thread pool etc.) for TF Lite to // use certain keneral libraries, such as Gemmlowp, RUY, etc., to implement TF // Lite operators. -// TODO(b/130950871): Make this class as a interface-only abstract class. class TfLiteInternalBackendContext { public: virtual ~TfLiteInternalBackendContext() {} - int max_num_threads() const { return max_num_threads_; } - - virtual void set_max_num_threads(int max_num_threads) { - max_num_threads_ = max_num_threads; - } - - protected: - TfLiteInternalBackendContext() {} - - // The maximum number of threads used for parallelizing TfLite computation. 
- int max_num_threads_; - - private: - TfLiteInternalBackendContext(const TfLiteInternalBackendContext&) = delete; - TfLiteInternalBackendContext& operator=(const TfLiteInternalBackendContext&) = - delete; + // Set the maximum number of threads that could be used for parallelizing + // TfLite computation. + virtual void SetMaxNumThreads(int max_num_threads) = 0; }; // This TfLiteExternalContext-derived class is the default diff --git a/tensorflow/lite/kernels/cpu_backend_context.cc b/tensorflow/lite/kernels/cpu_backend_context.cc index f9a1ee0a86b..63f12208630 100644 --- a/tensorflow/lite/kernels/cpu_backend_context.cc +++ b/tensorflow/lite/kernels/cpu_backend_context.cc @@ -24,12 +24,12 @@ CpuBackendContext::CpuBackendContext() : TfLiteInternalBackendContext(), ruy_context_(new ruy::Context), gemmlowp_context_(new gemmlowp::GemmContext) { - set_max_num_threads(1); + SetMaxNumThreads(1); } CpuBackendContext::~CpuBackendContext() {} -void CpuBackendContext::set_max_num_threads(int max_num_threads) { +void CpuBackendContext::SetMaxNumThreads(int max_num_threads) { max_num_threads_ = max_num_threads; ruy_context_->max_num_threads = max_num_threads; gemmlowp_context_->set_max_num_threads(max_num_threads); diff --git a/tensorflow/lite/kernels/cpu_backend_context.h b/tensorflow/lite/kernels/cpu_backend_context.h index 00b12d8ba54..a55a951ac99 100644 --- a/tensorflow/lite/kernels/cpu_backend_context.h +++ b/tensorflow/lite/kernels/cpu_backend_context.h @@ -35,17 +35,11 @@ class CpuBackendContext final : public TfLiteInternalBackendContext { return gemmlowp_context_.get(); } - // Sets the maximum-number-of-threads-to-use parameter. - // This is only a means of passing around this information. - // cpu_backend_threadpool::Execute creates as many threads as it's - // asked to, regardless of this. Typically a call site would query - // cpu_backend_context->max_num_threads() and used that to determine - // the number of tasks to create and to give to - // cpu_backend_threadpool::Execute. - // - // This value also gets propagated to back-ends, where it plays the same - // information-only role. - void set_max_num_threads(int max_num_threads) override; + // Sets the maximum-number-of-threads-to-use parameter, only as a means of + // passing around this information. + void SetMaxNumThreads(int max_num_threads) override; + + int max_num_threads() const { return max_num_threads_; } private: // To enable a smooth transition from the current direct usage @@ -57,6 +51,17 @@ class CpuBackendContext final : public TfLiteInternalBackendContext { const std::unique_ptr ruy_context_; const std::unique_ptr gemmlowp_context_; + // The maxinum of threads used for parallelizing TfLite ops. However, + // cpu_backend_threadpool::Execute creates as many threads as it's + // asked to, regardless of this. Typically a call site would query + // cpu_backend_context->max_num_threads() and used that to determine + // the number of tasks to create and to give to + // cpu_backend_threadpool::Execute. + // + // This value also gets propagated to back-ends, where it plays the same + // information-only role. 
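Renaming set_max_num_threads to SetMaxNumThreads does not change its advisory nature, which the comment above spells out: the threadpool still runs exactly as many tasks as the call site creates. A sketch of the intended call-site pattern, modeled on cpu_backend_threadpool_test.cc (ShardTask, RunSharded and the shard math are illustrative, not part of the patch):

#include <algorithm>
#include <vector>

#include "tensorflow/lite/kernels/cpu_backend_context.h"
#include "tensorflow/lite/kernels/cpu_backend_threadpool.h"

namespace {

// Illustrative task: the threadpool only needs objects derived from the Task
// type it re-exports, each providing a Run() method.
class ShardTask : public tflite::cpu_backend_threadpool::Task {
 public:
  explicit ShardTask(int shard) : shard_(shard) {}
  void Run() override { /* process shard_ here */ }

 private:
  int shard_;
};

void RunSharded(int total_shards, tflite::CpuBackendContext* ctx) {
  // max_num_threads() is advisory: it caps how many tasks we choose to make,
  // while Execute() always runs exactly as many tasks as it is handed.
  const int num_tasks = std::min(total_shards, ctx->max_num_threads());
  std::vector<ShardTask> tasks;
  tasks.reserve(num_tasks);
  for (int i = 0; i < num_tasks; ++i) tasks.emplace_back(i);
  tflite::cpu_backend_threadpool::Execute(tasks.size(), tasks.data(), ctx);
}

}  // namespace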
+ int max_num_threads_; + CpuBackendContext(const CpuBackendContext&) = delete; }; diff --git a/tensorflow/lite/kernels/cpu_backend_gemm_test.cc b/tensorflow/lite/kernels/cpu_backend_gemm_test.cc index c193d1b60cc..fe2792b88cd 100644 --- a/tensorflow/lite/kernels/cpu_backend_gemm_test.cc +++ b/tensorflow/lite/kernels/cpu_backend_gemm_test.cc @@ -363,7 +363,7 @@ void TestSomeGemm(int rows, int depth, int cols, const std::vector& golden) { CpuBackendContext cpu_backend_context; std::default_random_engine random_engine; - cpu_backend_context.set_max_num_threads(1 + (random_engine() % 8)); + cpu_backend_context.SetMaxNumThreads(1 + (random_engine() % 8)); const bool use_golden = !golden.empty(); diff --git a/tensorflow/lite/kernels/cpu_backend_support.cc b/tensorflow/lite/kernels/cpu_backend_support.cc index 64a41b2e1ec..ab47d5b7e99 100644 --- a/tensorflow/lite/kernels/cpu_backend_support.cc +++ b/tensorflow/lite/kernels/cpu_backend_support.cc @@ -46,8 +46,7 @@ CpuBackendContext* GetFromContext(TfLiteContext* context) { // that's wrapped inside ExternalCpuBackendContext. cpu_backend_context = new CpuBackendContext(); if (context->recommended_num_threads != -1) { - cpu_backend_context->set_max_num_threads( - context->recommended_num_threads); + cpu_backend_context->SetMaxNumThreads(context->recommended_num_threads); } external_context->set_internal_backend_context( std::unique_ptr(cpu_backend_context)); diff --git a/tensorflow/lite/kernels/cpu_backend_threadpool_test.cc b/tensorflow/lite/kernels/cpu_backend_threadpool_test.cc index 45208a383c5..5089323070a 100644 --- a/tensorflow/lite/kernels/cpu_backend_threadpool_test.cc +++ b/tensorflow/lite/kernels/cpu_backend_threadpool_test.cc @@ -61,10 +61,10 @@ void TestGenerateArrayOfIncrementingInts(int num_threads, int size) { ASSERT_EQ(num_threads, tasks.size()); CpuBackendContext context; - // This set_max_num_threads is only to satisfy an assertion in Execute. + // This SetMaxNumThreads is only to satisfy an assertion in Execute. // What actually determines the number of threads used is the parameter // passed to Execute, since Execute does 1:1 mapping of tasks to threads. - context.set_max_num_threads(num_threads); + context.SetMaxNumThreads(num_threads); // Execute tasks on the threadpool. cpu_backend_threadpool::Execute(tasks.size(), tasks.data(), &context); diff --git a/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc b/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc index fd5b89eaf73..1c3d0e9ad62 100644 --- a/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc +++ b/tensorflow/lite/kernels/internal/depthwiseconv_quantized_test.cc @@ -292,7 +292,7 @@ inline void DispatchDepthwiseConv( << " input_offset = " << params.input_offset; CpuBackendContext backend_context; - backend_context.set_max_num_threads(test_param.num_threads); + backend_context.SetMaxNumThreads(test_param.num_threads); optimized_ops::DepthwiseConv( params, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data, output_shape, output_data, &backend_context); From 91425cf5975f73984b63910d4b5bdc0d13a3e9ec Mon Sep 17 00:00:00 2001 From: Eugene Brevdo Date: Mon, 22 Jul 2019 21:07:01 -0700 Subject: [PATCH 0373/3053] Make TensorList objects Refcounted. This drastically reduces the amount of refcounting of individual tensors inside TensorList when a TensorList variant is copied to a Variable or MutableDenseHashTable (and back). 
Same for operations like tf.stack that operate on Variant tensors and perform Variant copies implicitly. While this change adds a level of indirection into the TensorList object by adding a heap-allocated RefCounted object to contain the vector, it also reduces the size of the TensorList below the tf::Variant inlining threshold. This in turn removes a level of heap indirection and should cancel out any performance regressions for existing TensorList operations and small-size lists. PiperOrigin-RevId: 259464769 --- tensorflow/core/kernels/list_kernels.cc | 201 ++++++++------- tensorflow/core/kernels/list_kernels.h | 237 +++++++++++++----- .../python/kernel_tests/list_ops_test.py | 16 +- 3 files changed, 305 insertions(+), 149 deletions(-) diff --git a/tensorflow/core/kernels/list_kernels.cc b/tensorflow/core/kernels/list_kernels.cc index afe4b24731b..c0f57b912c0 100644 --- a/tensorflow/core/kernels/list_kernels.cc +++ b/tensorflow/core/kernels/list_kernels.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include + #include "tensorflow/core/framework/allocator.h" #define EIGEN_USE_THREADS @@ -21,8 +22,6 @@ limitations under the License. #define EIGEN_USE_GPU #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM -#include "tensorflow/core/kernels/list_kernels.h" - #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -30,6 +29,7 @@ limitations under the License. #include "tensorflow/core/framework/variant.h" #include "tensorflow/core/framework/variant_op_registry.h" #include "tensorflow/core/kernels/concat_lib.h" +#include "tensorflow/core/kernels/list_kernels.h" #include "tensorflow/core/lib/core/coding.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/util/util.h" @@ -38,20 +38,16 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; -// Variant compatible type for a list of tensors. This is mutable but instances -// should never be mutated after stored in a variant tensor. 
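The size argument in the commit message is easy to sanity-check in isolation: replacing an inline vector member with a single pointer to a refcounted container is what pulls the wrapper under the Variant inline-storage threshold. A standalone toy comparison (the two structs are stand-ins, not the real TensorList layout):

#include <cstdint>
#include <vector>

// Stand-in for a list that stores its elements inline.
struct InlineStorageList {
  std::vector<int64_t> elements;  // three pointers of footprint on common ABIs
  int32_t dtype = 0;
  int32_t max_num_elements = -1;
};

// Stand-in for a list that shares one refcounted container of elements.
struct RefcountedStorageList {
  void* shared_elements = nullptr;  // one pointer, shared via Ref()/Unref()
  int32_t dtype = 0;
  int32_t max_num_elements = -1;
};

static_assert(sizeof(RefcountedStorageList) < sizeof(InlineStorageList),
              "the pointer-to-container layout is the smaller of the two");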
-TensorList::TensorList(const TensorList& other) - : tensors(other.tensors), - element_shape(other.element_shape), - element_dtype(other.element_dtype), - max_num_elements(other.max_num_elements) {} +TensorList::~TensorList() { + if (tensors_) tensors_->Unref(); +} void TensorList::Encode(VariantTensorData* data) const { data->set_type_name(TypeName()); std::vector invalid_indices; - for (size_t i = 0; i < tensors.size(); i++) { - if (tensors.at(i).dtype() != DT_INVALID) { - *data->add_tensors() = tensors.at(i); + for (size_t i = 0; i < tensors().size(); i++) { + if (tensors().at(i).dtype() != DT_INVALID) { + *data->add_tensors() = tensors().at(i); } else { invalid_indices.push_back(i); } @@ -78,11 +74,11 @@ static Status TensorListDeviceCopy( to->element_shape = from.element_shape; to->element_dtype = from.element_dtype; to->max_num_elements = from.max_num_elements; - to->tensors.reserve(from.tensors.size()); - for (const Tensor& t : from.tensors) { - to->tensors.emplace_back(t.dtype()); + to->tensors().reserve(from.tensors().size()); + for (const Tensor& t : from.tensors()) { + to->tensors().emplace_back(t.dtype()); if (t.dtype() != DT_INVALID) { - TF_RETURN_IF_ERROR(copy(t, &to->tensors.back())); + TF_RETURN_IF_ERROR(copy(t, &to->tensors().back())); } } return Status::OK(); @@ -116,16 +112,16 @@ bool TensorList::Decode(const VariantTensorData& data) { } size_t total_num_tensors = data.tensors().size() + num_invalid_tensors; - tensors.reserve(total_num_tensors); + tensors().reserve(total_num_tensors); std::vector::iterator invalid_indices_it = invalid_indices.begin(); std::vector::const_iterator tensors_it = data.tensors().begin(); for (size_t i = 0; i < total_num_tensors; i++) { if (invalid_indices_it != invalid_indices.end() && *invalid_indices_it == i) { - tensors.emplace_back(Tensor(DT_INVALID)); + tensors().emplace_back(Tensor(DT_INVALID)); invalid_indices_it++; } else if (tensors_it != data.tensors().end()) { - tensors.emplace_back(*tensors_it); + tensors().emplace_back(*tensors_it); tensors_it++; } else { // VariantTensorData is corrupted. @@ -201,19 +197,31 @@ Status ForwardInputOrCreateNewList(OpKernelContext* c, int32 input_index, input_index, output_index, DT_VARIANT, TensorShape{}, c->input_memory_type(input_index), AllocatorAttributes()); Tensor* output_tensor; - if (maybe_output != nullptr) { - // Woohoo, forwarding succeeded! + if (maybe_output != nullptr && maybe_output->dtype() == DT_VARIANT && + maybe_output->NumElements() == 1) { output_tensor = maybe_output.get(); - c->set_output(output_index, *output_tensor); - } else { - // If forwarding is not possible allocate a new output tensor and copy - // the `input_list` to it. - AllocatorAttributes attr; - attr.set_on_host(true); - TF_RETURN_IF_ERROR( - c->allocate_output(output_index, {}, &output_tensor, attr)); - output_tensor->scalar()() = input_list; + TensorList* tmp_out = output_tensor->scalar()().get(); + if (tmp_out == nullptr) { + return errors::InvalidArgument( + "Expected input ", input_index, " to be a TensorList but saw ", + output_tensor->scalar()().TypeName()); + } + if (tmp_out->RefCountIsOne()) { + // Woohoo, forwarding succeeded! + c->set_output(output_index, *output_tensor); + *output_list = tmp_out; + return Status::OK(); + } } + + // If forwarding is not possible allocate a new output tensor and copy + // the `input_list` to it. 
+ AllocatorAttributes attr; + attr.set_on_host(true); + TF_RETURN_IF_ERROR( + c->allocate_output(output_index, {}, &output_tensor, attr)); + output_tensor->scalar()() = input_list.Copy(); + *output_list = output_tensor->scalar()().get(); return Status::OK(); } @@ -295,15 +303,15 @@ class TensorListPushBack : public OpKernel { if (l->max_num_elements != -1) { OP_REQUIRES( - c, l->tensors.size() < l->max_num_elements, + c, l->tensors().size() < l->max_num_elements, errors::InvalidArgument("Tried to push item into a full list", - " list size: ", l->tensors.size(), + " list size: ", l->tensors().size(), " max_num_elements: ", l->max_num_elements)); } TensorList* output_list = nullptr; OP_REQUIRES_OK(c, ForwardInputOrCreateNewList(c, 0, 0, *l, &output_list)); - output_list->tensors.push_back(input); + output_list->tensors().push_back(input); } private: @@ -330,7 +338,7 @@ class TensorListLength : public OpKernel { OP_REQUIRES_OK(c, GetInputList(c, 0, &l)); Tensor* result; OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result)); - result->scalar()() = l->tensors.size(); + result->scalar()() = l->tensors().size(); } }; @@ -399,7 +407,7 @@ class TensorListReserve : public OpKernel { TensorList output; output.element_shape = element_shape; output.element_dtype = element_dtype_; - output.tensors.resize(num_elements, Tensor(DT_INVALID)); + output.tensors().resize(num_elements, Tensor(DT_INVALID)); Tensor* result; AllocatorAttributes attr; attr.set_on_host(true); @@ -440,32 +448,37 @@ class TensorListResize : public OpKernel { c->forward_input(0, 0, DT_VARIANT, TensorShape{}, c->input_memory_type(0), AllocatorAttributes()); if (maybe_result != nullptr) { - maybe_result->scalar()().get()->tensors.resize( - size, Tensor(DT_INVALID)); - c->set_output(0, *maybe_result); - } else { - Tensor* result; - AllocatorAttributes attr; - attr.set_on_host(true); - OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result, attr)); - TensorList output_list; - output_list.element_shape = input_list->element_shape; - output_list.element_dtype = input_list->element_dtype; - output_list.max_num_elements = input_list->max_num_elements; - if (size > input_list->tensors.size()) { - output_list.tensors.insert(output_list.tensors.begin(), - input_list->tensors.begin(), - input_list->tensors.end()); - // Add DT_INVALID tensors to the end of the list if the requested size - // is larger than the list length. - output_list.tensors.resize(size, Tensor(DT_INVALID)); - } else { - output_list.tensors.insert(output_list.tensors.begin(), - input_list->tensors.begin(), - input_list->tensors.begin() + size); + TensorList* out = maybe_result->scalar()().get(); + if (out->RefCountIsOne()) { + // We are able to forward the input. + out->tensors().resize(size, Tensor(DT_INVALID)); + c->set_output(0, *maybe_result); + return; } - result->scalar()() = std::move(output_list); } + + // We were not able to forward the input. Will have to resize from scratch. 
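Both forwarding helpers above lean on the same observation: an incoming list may be shared, and resizing a shared container in place would be visible to every other holder of that list. A small sketch of the hazard the RefCountIsOne() gate prevents, written against the accessors this patch introduces:

#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/kernels/list_kernels.h"

namespace tensorflow {

void SharedResizeSketch() {
  TensorList a;
  a.tensors().resize(4, Tensor(DT_INVALID));

  TensorList shared = a;        // copy-construction only bumps the refcount
  shared.tensors().resize(2);   // a.tensors().size() is now 2 as well

  // The gated pattern used by TensorListResize: mutate in place only when
  // this handle is the sole owner, otherwise work on a private copy.
  if (a.RefCountIsOne()) {
    a.tensors().resize(3);
  } else {
    TensorList fresh = a.Copy();  // fresh vector, same underlying buffers
    fresh.tensors().resize(3);
  }
}

}  // namespace tensorflow

In the kernels themselves the gate is applied to the forwarded output tensor rather than a local, but the ownership question being answered is the same.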
+ Tensor* result; + AllocatorAttributes attr; + attr.set_on_host(true); + OP_REQUIRES_OK(c, c->allocate_output(0, TensorShape{}, &result, attr)); + TensorList output_list; + output_list.element_shape = input_list->element_shape; + output_list.element_dtype = input_list->element_dtype; + output_list.max_num_elements = input_list->max_num_elements; + if (size > input_list->tensors().size()) { + output_list.tensors().insert(output_list.tensors().begin(), + input_list->tensors().begin(), + input_list->tensors().end()); + // Add DT_INVALID tensors to the end of the list if the requested size + // is larger than the list length. + output_list.tensors().resize(size, Tensor(DT_INVALID)); + } else { + output_list.tensors().insert(output_list.tensors().begin(), + input_list->tensors().begin(), + input_list->tensors().begin() + size); + } + result->scalar()() = std::move(output_list); } }; @@ -495,9 +508,9 @@ class TensorListSetItem : public OpKernel { " but list elements ", DataTypeString(l->element_dtype))); int32 index = c->input(1).scalar()(); - OP_REQUIRES(c, index < l->tensors.size(), + OP_REQUIRES(c, index < l->tensors().size(), errors::InvalidArgument("Trying to modify element ", index, - " in a list with ", l->tensors.size(), + " in a list with ", l->tensors().size(), " elements.")); const Tensor& value = c->input(2); OP_REQUIRES(c, l->element_shape.IsCompatibleWith(value.shape()), @@ -508,7 +521,7 @@ class TensorListSetItem : public OpKernel { " list shape: ", l->element_shape.DebugString())); TensorList* output_list = nullptr; OP_REQUIRES_OK(c, ForwardInputOrCreateNewList(c, 0, 0, *l, &output_list)); - output_list->tensors[index] = value; + output_list->tensors()[index] = value; } private: @@ -560,11 +573,26 @@ class TensorListConcatLists : public OpKernel { const Tensor& tl_a = c->input(0); const Tensor& tl_b = c->input(1); - Tensor* output; - if (tl_alias) { - c->set_output(0, *tl_alias); - output = tl_alias.get(); - } else { + Tensor* output = nullptr; + bool ok_to_alias = tl_alias != nullptr; + if (tl_alias && tl_alias->dtype() == DT_VARIANT && + tl_alias->NumElements() > 0) { + auto tl_a_t = tl_alias->flat(); + for (int64 i = 0; i < tl_alias->NumElements(); ++i) { + TensorList* aliased = tl_a_t(i).get(); + if (aliased == nullptr || !aliased->RefCountIsOne()) { + ok_to_alias = false; + break; + } + } + if (ok_to_alias) { + c->set_output(0, *tl_alias); + output = tl_alias.get(); + } + } + if (!ok_to_alias) { + // Couldn't alias the entire Tensor. We'll be conservative and not try + // to alias individual batch entries. attr.set_on_host(true); OP_REQUIRES_OK(c, c->allocate_output(0, tl_a_shape, &output, attr)); } @@ -573,45 +601,42 @@ class TensorListConcatLists : public OpKernel { auto tl_a_t = tl_a.flat(); auto tl_b_t = tl_b.flat(); - for (int64 b = 0; b < tl_a.NumElements(); ++b) { - const TensorList* l_a = tl_a_t(b).get(); - const TensorList* l_b = tl_b_t(b).get(); + for (int64 i = 0; i < tl_a.NumElements(); ++i) { + const TensorList* l_a = tl_a_t(i).get(); + const TensorList* l_b = tl_b_t(i).get(); OP_REQUIRES( c, l_a != nullptr, - errors::InvalidArgument("input_a is not a TensorList at index ", b, - ". Saw: '", tl_a_t(b).DebugString(), "'")); + errors::InvalidArgument("input_a is not a TensorList at index ", i, + ". Saw: '", tl_a_t(i).DebugString(), "'")); OP_REQUIRES( c, l_b != nullptr, - errors::InvalidArgument("input_b is not a TensorList at index ", b, - ". Saw: '", tl_b_t(b).DebugString(), "'")); + errors::InvalidArgument("input_b is not a TensorList at index ", i, + ". 
Saw: '", tl_b_t(i).DebugString(), "'")); OP_REQUIRES(c, l_a->element_dtype == element_dtype_, errors::InvalidArgument( - "input_a[", b, "].dtype != element_dtype. Saw: ", + "input_a[", i, "].dtype != element_dtype. Saw: ", DataTypeString(l_a->element_dtype), " vs. ", DataTypeString(element_dtype_))); OP_REQUIRES(c, l_b->element_dtype == element_dtype_, errors::InvalidArgument( - "input_b[", b, "].dtype != element_dtype. Saw: ", + "input_b[", i, "].dtype != element_dtype. Saw: ", DataTypeString(l_b->element_dtype), " vs. ", DataTypeString(element_dtype_))); OP_REQUIRES(c, l_a->element_shape.IsIdenticalTo(l_b->element_shape), errors::InvalidArgument( "input_a and input_b TensorList element shapes are not " "identical at index ", - b, ". Saw ", l_a->element_shape.DebugString(), " vs. ", + i, ". Saw ", l_a->element_shape.DebugString(), " vs. ", l_b->element_shape.DebugString())); - if (tl_alias) { - TensorList* out = output_t(b).get(); - DCHECK(out != nullptr) << "Expected output to alias input_a, but it " - "doesn't contain a TensorList at index " - << b; - std::copy(l_b->tensors.begin(), l_b->tensors.end(), - std::back_inserter(out->tensors)); + if (ok_to_alias) { + TensorList* out = output_t(i).get(); + std::copy(l_b->tensors().begin(), l_b->tensors().end(), + std::back_inserter(out->tensors())); } else { - TensorList out = *l_a; - std::copy(l_b->tensors.begin(), l_b->tensors.end(), - std::back_inserter(out.tensors)); - output_t(b) = std::move(out); + TensorList out = l_a->Copy(); + std::copy(l_b->tensors().begin(), l_b->tensors().end(), + std::back_inserter(out.tensors())); + output_t(i) = std::move(out); } } } diff --git a/tensorflow/core/kernels/list_kernels.h b/tensorflow/core/kernels/list_kernels.h index a33ca1cee19..3a6b553f7a8 100644 --- a/tensorflow/core/kernels/list_kernels.h +++ b/tensorflow/core/kernels/list_kernels.h @@ -31,7 +31,9 @@ limitations under the License. #include "tensorflow/core/kernels/fill_functor.h" #include "tensorflow/core/lib/core/coding.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/refcount.h" #include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/platform.h" #include "tensorflow/core/util/tensor_ops_util.h" #include "tensorflow/core/util/util.h" @@ -41,12 +43,85 @@ typedef Eigen::ThreadPoolDevice CPUDevice; // Variant compatible type for a list of tensors. This is mutable but instances // should never be mutated after stored in a variant tensor. -struct TensorList { +// +// **NOTE**: TensorList stores a refcounted container of tf::Tensor objects, +// which are accessible via TensorList::tensors(). Because it is refcounted, +// straight copies of the form: +// +// TensorList b = a; +// b.tensors().push_back(t); // WARNING: This modifies a.tensors(). +// +// Do not create a true copy of the underlying container - but instead increment +// a reference count. Modifying b.tensors() modifies a.tensors(). In this way, +// TensorList should be considered similar to the tf::Tensor object. +// +// In order to get a copy of the underlying list, use the Copy method: +// +// TensorList b = a.Copy(); +// b.tensors().push_back(t); // This does not modify a.tensors(). +// +// Note that this is not a deep copy: the memory locations of the underlying +// tensors will still point to the same locations of the corresponding tensors +// in the original. To truly perform a deep copy, Device and Type-specific +// code needs to be applied to the underlying tensors as usual. 
+// +// The most important implication of RefCounted TLs is that OpKernels +// wishing to reuse TensorList inputs as outputs via context->forward_input() +// need to perform an additional check on the refcount of the TensorList, +// to ensure aliasing can be performed safely. For example: +// +// bool can_alias = false; +// auto fw = c->forward_input(..., DT_VARIANT, {}, ...); +// if (fw && fw->dtype() == DT_VARIANT && fw->NumElements() == 1) { +// auto* tl = fw->scalar()().get(); +// if (tl && tl->RefCountIsOne()) { +// can_alias = true; +// } +// } +// +class TensorList { public: - TensorList() {} - TensorList(const TensorList& other); + TensorList() : tensors_(new Tensors) {} + ~TensorList(); + + TensorList(const TensorList& other) + : element_shape(other.element_shape), + element_dtype(other.element_dtype), + max_num_elements(other.max_num_elements), + tensors_(other.tensors_) { + tensors_->Ref(); + } + + TensorList(TensorList&& rhs) + : element_shape(std::move(rhs.element_shape)), + element_dtype(rhs.element_dtype), + max_num_elements(rhs.max_num_elements), + tensors_(rhs.tensors_) { + rhs.tensors_ = nullptr; + } + + TensorList& operator=(const TensorList& rhs) { + if (this == &rhs) return *this; + element_shape = rhs.element_shape; + element_dtype = rhs.element_dtype; + max_num_elements = rhs.max_num_elements; + tensors_->Unref(); + tensors_ = rhs.tensors_; + tensors_->Ref(); + return *this; + } + + TensorList& operator=(TensorList&& rhs) { + if (this == &rhs) return *this; + element_shape = rhs.element_shape; + element_dtype = rhs.element_dtype; + max_num_elements = rhs.max_num_elements; + std::swap(tensors_, rhs.tensors_); + return *this; + } static const char kTypeName[]; + string TypeName() const { return kTypeName; } void Encode(VariantTensorData* data) const; @@ -56,14 +131,47 @@ struct TensorList { // TODO(apassos) fill this out string DebugString() const { return "TensorList"; } - std::vector tensors; PartialTensorShape element_shape; + DataType element_dtype; + // The maximum allowed size of `tensors`. Defaults to -1 meaning that the size // of `tensors` is unbounded. int max_num_elements = -1; + + // Access to the underlying tensor container. + std::vector& tensors() { return tensors_->values_; } + const std::vector& tensors() const { return tensors_->values_; } + + // Get a new TensorList containing a copy of the underlying tensor container. + TensorList Copy() const { + TensorList out; + out.element_shape = element_shape; + out.element_dtype = element_dtype; + out.max_num_elements = max_num_elements; + // This performs a copy of the std::vector. + out.tensors_->values_ = tensors_->values_; + return out; + } + + // Is this TensorList the only one with a reference to the underlying + // container? + bool RefCountIsOne() const { return tensors_->RefCountIsOne(); } + + private: + class Tensors : public core::RefCounted { + public: + std::vector values_; + }; + Tensors* tensors_; }; +#if defined(PLATFORM_GOOGLE) +// TODO(ebrevdo): Identify why Variant inline size is smaller on mobile devices. 
+static_assert(Variant::CanInlineType(), + "Must be able to inline TensorList into a Variant"); +#endif + Status TensorShapeFromTensor(const Tensor& t, PartialTensorShape* out); Status GetElementShapeFromInput(OpKernelContext* c, @@ -96,18 +204,19 @@ class TensorListStack : public OpKernel { "Invalid data types; op elements ", DataTypeString(element_dtype_), " but list elements ", DataTypeString(tensor_list->element_dtype))); if (num_elements_ != -1) { - OP_REQUIRES(c, tensor_list->tensors.size() == num_elements_, + OP_REQUIRES(c, tensor_list->tensors().size() == num_elements_, errors::InvalidArgument( "Operation expected a list with ", num_elements_, " elements but got a list with ", - tensor_list->tensors.size(), " elements.")); + tensor_list->tensors().size(), " elements.")); } PartialTensorShape partial_element_shape; OP_REQUIRES_OK(c, GetElementShapeFromInput(c, *tensor_list, 1, &partial_element_shape)); OP_REQUIRES( c, - partial_element_shape.IsFullyDefined() || !tensor_list->tensors.empty(), + partial_element_shape.IsFullyDefined() || + !tensor_list->tensors().empty(), errors::InvalidArgument("Tried to stack elements of an empty ", "list with non-fully-defined element_shape: ", partial_element_shape.DebugString())); @@ -115,8 +224,8 @@ class TensorListStack : public OpKernel { // Check that `element_shape` input tensor is compatible with the shapes of // element tensors. if (!tensor_list->element_shape.IsFullyDefined()) { - for (int i = 0; i < tensor_list->tensors.size(); ++i) { - const Tensor& t = tensor_list->tensors[i]; + for (int i = 0; i < tensor_list->tensors().size(); ++i) { + const Tensor& t = tensor_list->tensors()[i]; if (t.dtype() != DT_INVALID) { PartialTensorShape tmp = partial_element_shape; OP_REQUIRES_OK(c, tmp.MergeWith(t.shape(), &partial_element_shape)); @@ -133,7 +242,7 @@ class TensorListStack : public OpKernel { "tensors and has a non-fully-defined element_shape: ", partial_element_shape.DebugString())); TensorShape output_shape = element_shape; - output_shape.InsertDim(0, tensor_list->tensors.size()); + output_shape.InsertDim(0, tensor_list->tensors().size()); Tensor* output; OP_REQUIRES_OK(c, c->allocate_output(0, output_shape, &output)); if (output->NumElements() == 0) { @@ -141,9 +250,9 @@ class TensorListStack : public OpKernel { } ConstMatrixVector inputs_flat; - inputs_flat.reserve(tensor_list->tensors.size()); + inputs_flat.reserve(tensor_list->tensors().size()); Tensor zeros; - for (const auto& t : tensor_list->tensors) { + for (const auto& t : tensor_list->tensors()) { if (t.dtype() != DT_INVALID) { inputs_flat.emplace_back(new typename TTypes::ConstMatrix( t.shaped({1, t.NumElements()}))); @@ -195,12 +304,12 @@ class TensorListGetItem : public OpKernel { " but list elements ", DataTypeString(l->element_dtype))); int32 index = c->input(1).scalar()(); - OP_REQUIRES(c, index < l->tensors.size(), + OP_REQUIRES(c, index < l->tensors().size(), errors::InvalidArgument("Trying to access element ", index, - " in a list with ", l->tensors.size(), + " in a list with ", l->tensors().size(), " elements.")); - if (l->tensors[index].dtype() != DT_INVALID) { - c->set_output(0, l->tensors[index]); + if (l->tensors()[index].dtype() != DT_INVALID) { + c->set_output(0, l->tensors()[index]); } else { PartialTensorShape partial_element_shape; OP_REQUIRES_OK( @@ -216,7 +325,7 @@ class TensorListGetItem : public OpKernel { // In that mode TensorArray sets the array's element_shape on the first // write call. We could do something similar here if needed. 
if (!partial_element_shape.IsFullyDefined()) { - for (const Tensor& t : l->tensors) { + for (const Tensor& t : l->tensors()) { if (t.dtype() != DT_INVALID) { PartialTensorShape tmp = partial_element_shape; OP_REQUIRES_OK(c, tmp.MergeWith(t.shape(), &partial_element_shape)); @@ -260,10 +369,10 @@ class TensorListPopBack : public OpKernel { " but list elements ", DataTypeString(l->element_dtype))); - OP_REQUIRES(c, !l->tensors.empty(), + OP_REQUIRES(c, !l->tensors().empty(), errors::InvalidArgument("Trying to pop from an empty list.")); - const Tensor& t = l->tensors.back(); + const Tensor& t = l->tensors().back(); if (t.dtype() != DT_INVALID) { c->set_output(1, t); } else { @@ -288,7 +397,7 @@ class TensorListPopBack : public OpKernel { TensorList* output_list = nullptr; OP_REQUIRES_OK(c, ForwardInputOrCreateNewList(c, 0, 0, *l, &output_list)); - output_list->tensors.pop_back(); + output_list->tensors().pop_back(); } private: @@ -347,7 +456,7 @@ class TensorListConcat : public OpKernel { // If the TensorList is empty, element_shape_except_first_dim_ must be fully // defined. OP_REQUIRES(c, - !tensor_list->tensors.empty() || + !tensor_list->tensors().empty() || element_shape_except_first_dim_.IsFullyDefined(), errors::InvalidArgument( "All except the first dimension must be fully defined ", @@ -364,8 +473,8 @@ class TensorListConcat : public OpKernel { if (!tensor_list->element_shape.IsFullyDefined()) { bool check_dim = (first_dim == -1); int64 inferred_first_dim = first_dim; - for (int i = 0; i < tensor_list->tensors.size(); ++i) { - const Tensor& t = tensor_list->tensors[i]; + for (int i = 0; i < tensor_list->tensors().size(); ++i) { + const Tensor& t = tensor_list->tensors()[i]; if (t.dtype() != DT_INVALID) { PartialTensorShape tmp = element_shape_except_first_dim_; OP_REQUIRES( @@ -407,14 +516,14 @@ class TensorListConcat : public OpKernel { OP_REQUIRES_OK( c, c->allocate_output( - 1, TensorShape({static_cast(tensor_list->tensors.size())}), + 1, TensorShape({static_cast(tensor_list->tensors().size())}), &lengths_tensor)); auto lengths_tensor_vec = lengths_tensor->vec(); int64 leading_dim = 0; - for (size_t i = 0; i < tensor_list->tensors.size(); i++) { + for (size_t i = 0; i < tensor_list->tensors().size(); i++) { int64 dim; - if (tensor_list->tensors[i].dtype() != DT_INVALID) { - dim = tensor_list->tensors[i].shape().dim_size(0); + if (tensor_list->tensors()[i].dtype() != DT_INVALID) { + dim = tensor_list->tensors()[i].shape().dim_size(0); } else { // If leading_dims is not provided or does not contain an entry for // index i use the inferred `first_dim` if set. @@ -449,12 +558,12 @@ class TensorListConcat : public OpKernel { } ConstMatrixVector inputs_flat; - inputs_flat.reserve(tensor_list->tensors.size()); + inputs_flat.reserve(tensor_list->tensors().size()); // Store the zeros tensors in a vector to prevent them from being GC'ed till // concat is complete. 
std::vector zeros_vec; - for (int i = 0; i < tensor_list->tensors.size(); i++) { - const Tensor& element_tensor = tensor_list->tensors[i]; + for (int i = 0; i < tensor_list->tensors().size(); i++) { + const Tensor& element_tensor = tensor_list->tensors()[i]; if (element_tensor.dtype() != DT_INVALID) { inputs_flat.emplace_back(new typename TTypes::ConstMatrix( element_tensor.shaped({1, element_tensor.NumElements()}))); @@ -536,7 +645,7 @@ class TensorListSplit : public OpKernel { errors::InvalidArgument( "Expected lengths to be a vector, received shape: ", lengths.shape().DebugString())); - output_list.tensors.reserve(lengths.shape().dim_size(0)); + output_list.tensors().reserve(lengths.shape().dim_size(0)); int64 start = 0; int64 end = 0; for (int i = 0; i < lengths.shape().dim_size(0); ++i) { @@ -557,7 +666,7 @@ class TensorListSplit : public OpKernel { OP_REQUIRES_OK(c, c->allocate_temp(tmp.dtype(), tmp.shape(), &aligned)); aligned.flat().device(c->eigen_device()) = tmp.unaligned_flat(); - output_list.tensors.emplace_back(aligned); + output_list.tensors().emplace_back(aligned); } OP_REQUIRES(c, end == input_tensor.shape().dim_size(0), errors::InvalidArgument( @@ -599,7 +708,7 @@ class TensorListGather : public OpKernel { if (!tensor_list->element_shape.IsFullyDefined()) { for (int index = 0; index < indices.NumElements(); ++index) { const int i = indices.flat()(index); - const Tensor& t = tensor_list->tensors[i]; + const Tensor& t = tensor_list->tensors()[i]; if (t.dtype() != DT_INVALID) { PartialTensorShape tmp = partial_element_shape; OP_REQUIRES_OK(c, tmp.MergeWith(t.shape(), &partial_element_shape)); @@ -629,10 +738,10 @@ class TensorListGather : public OpKernel { for (int index = 0; index < indices.NumElements(); ++index) { const int i = indices.flat()(index); OP_REQUIRES( - c, i < tensor_list->tensors.size(), + c, i < tensor_list->tensors().size(), errors::InvalidArgument("Index ", i, " out o range; list only has ", - tensor_list->tensors.size(), " elements.")); - const Tensor& t = tensor_list->tensors[i]; + tensor_list->tensors().size(), " elements.")); + const Tensor& t = tensor_list->tensors()[i]; if (t.dtype() != DT_INVALID) { inputs_flat.emplace_back(new typename TTypes::ConstMatrix( t.shaped({1, t.NumElements()}))); @@ -693,7 +802,7 @@ class TensorListFromTensor : public OpKernel { "Specified a list with shape ", element_shape.DebugString(), " from a tensor with shape ", output_shape.DebugString())); output_list.element_shape = element_shape; - output_list.tensors.reserve(t.shape().dim_size(0)); + output_list.tensors().reserve(t.shape().dim_size(0)); for (int i = 0; i < t.shape().dim_size(0); ++i) { Tensor tmp = t.Slice(i, i + 1); TensorShape tmp_shape = tmp.shape(); @@ -706,7 +815,7 @@ class TensorListFromTensor : public OpKernel { OP_REQUIRES_OK(c, c->allocate_temp(tmp.dtype(), tmp.shape(), &aligned)); aligned.flat().device(c->eigen_device()) = tmp.unaligned_flat(); - output_list.tensors.push_back(aligned); + output_list.tensors().push_back(aligned); } output_tensor->scalar()() = std::move(output_list); } @@ -732,7 +841,7 @@ Status Scatter(OpKernelContext* c, const Tensor& value, const Tensor& indices, // many small ones. aligned.flat().device(c->eigen_device()) = tmp.unaligned_flat(); - std::swap(list->tensors[i], aligned); + std::swap(list->tensors()[i], aligned); } return Status::OK(); } @@ -777,8 +886,8 @@ class TensorListScatterIntoExistingList : public OpKernel { ? 
-1 : *std::max_element(indices_vec.data(), indices_vec.data() + indices.NumElements()); - if (max_index + 1 > output_list->tensors.size()) { - output_list->tensors.resize(max_index + 1); + if (max_index + 1 > output_list->tensors().size()) { + output_list->tensors().resize(max_index + 1); } // Scatter the values. @@ -845,8 +954,8 @@ class TensorListScatter : public OpKernel { highest_index = i; } } - output_list.tensors.resize(std::max(highest_index + 1, num_elements), - Tensor(DT_INVALID)); + output_list.tensors().resize(std::max(highest_index + 1, num_elements), + Tensor(DT_INVALID)); } OP_REQUIRES_OK(c, @@ -875,19 +984,19 @@ Status TensorListBinaryAdd(OpKernelContext* c, const TensorList& a, TF_RETURN_IF_ERROR( a.element_shape.MergeWith(b.element_shape, &out->element_shape)); - if (a.tensors.size() != b.tensors.size()) { + if (a.tensors().size() != b.tensors().size()) { return errors::InvalidArgument( "Trying to add two lists of tensors with different lengths. One is ", - a.tensors.size(), " and the other is ", b.tensors.size()); + a.tensors().size(), " and the other is ", b.tensors().size()); } - out->tensors.reserve(a.tensors.size()); - for (int i = 0; i < a.tensors.size(); ++i) { - const Tensor& a_tensor = a.tensors[i]; - const Tensor& b_tensor = b.tensors[i]; + out->tensors().reserve(a.tensors().size()); + for (int i = 0; i < a.tensors().size(); ++i) { + const Tensor& a_tensor = a.tensors()[i]; + const Tensor& b_tensor = b.tensors()[i]; Tensor out_tensor; TF_RETURN_IF_ERROR( BinaryAddTensors(c, a_tensor, b_tensor, &out_tensor)); - out->tensors.push_back(out_tensor); + out->tensors().push_back(out_tensor); } return Status::OK(); } @@ -897,11 +1006,11 @@ Status TensorListZerosLike(OpKernelContext* c, const TensorList& x, TensorList* y) { y->element_dtype = x.element_dtype; y->element_shape = x.element_shape; - y->tensors.reserve(x.tensors.size()); - for (const Tensor& t : x.tensors) { + y->tensors().reserve(x.tensors().size()); + for (const Tensor& t : x.tensors()) { Tensor out_tensor; TF_RETURN_IF_ERROR(ZerosLikeTensor(c, t, &out_tensor)); - y->tensors.emplace_back(out_tensor); + y->tensors().emplace_back(out_tensor); } return Status::OK(); } @@ -936,7 +1045,19 @@ class TensorListPushBackBatch : public OpKernel { 0 /*input_index*/, 0 /*output_index*/, DT_VARIANT, tls_shape, DEVICE_MEMORY /* input is always on DEVICE_MEMORY */, attr); - const Tensor& tls = tls_alias ? *tls_alias : c->input(0); + bool ok_to_alias = tls_alias != nullptr; + if (tls_alias && tls_alias->dtype() == DT_VARIANT && + tls_alias->NumElements() > 0) { + auto alias_t = tls_alias->flat(); + for (int i = 0; i < tls_alias->NumElements(); ++i) { + TensorList* tl_i = alias_t(i).get(); + if (tl_i == nullptr || !tl_i->RefCountIsOne()) { + ok_to_alias = false; + break; + } + } + } + const Tensor& tls = ok_to_alias ? 
*tls_alias : c->input(0); OP_REQUIRES(c, tls.dtype() == DT_VARIANT, errors::InvalidArgument( @@ -979,7 +1100,7 @@ class TensorListPushBackBatch : public OpKernel { Tensor* result; - if (tls_alias) { + if (ok_to_alias) { result = tls_alias.get(); c->set_output(0, *result); } else { @@ -998,8 +1119,8 @@ class TensorListPushBackBatch : public OpKernel { auto result_t = result->vec(); for (int64 b = 0; b < batch_size; ++b) { - if (!tls_alias) { - result_t(b) = *tl_batch[b]; + if (!ok_to_alias) { + result_t(b) = tl_batch[b]->Copy(); } TensorList* output = result_t(b).get(); DCHECK(output != nullptr); @@ -1011,7 +1132,7 @@ class TensorListPushBackBatch : public OpKernel { auto frame_t = frame->flat(); frame_t.device(c->eigen_device()) = input_t.template chip<0>(b); } - output->tensors.push_back(std::move(*frame)); + output->tensors().push_back(std::move(*frame)); } } diff --git a/tensorflow/python/kernel_tests/list_ops_test.py b/tensorflow/python/kernel_tests/list_ops_test.py index 3c35b9767e9..f6046f425c5 100644 --- a/tensorflow/python/kernel_tests/list_ops_test.py +++ b/tensorflow/python/kernel_tests/list_ops_test.py @@ -53,7 +53,10 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase): max_num_elements=max_num_elements) l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0)) l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32) - self.assertAllEqual(self.evaluate(e), 1.0) + l = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32) + l, e = self.evaluate((l, e)) + self.assertAllEqual(l, []) + self.assertAllEqual(e, 1.0) @parameterized.named_parameters(("NoMaxNumElements", None), ("WithMaxNumElements", 2)) @@ -94,7 +97,10 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase): l = list_ops.tensor_list_reserve( element_dtype=dtypes.float32, element_shape=[2, 3], num_elements=3) _, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32) + l = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32) + l, e = self.evaluate((l, e)) self.assertAllEqual(e, np.zeros((2, 3))) + self.assertAllEqual(l, np.zeros((3, 2, 3))) def testPopUninitializedTensorUseSpecifiedElementShape(self): l = list_ops.tensor_list_reserve( @@ -954,14 +960,18 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase): l_concat_11 = list_ops.tensor_list_concat_lists( l_batch_1, l_batch_1, element_dtype=dtypes.float32) + expected_0 = [[1.0, 2.0], [-1.0]] + expected_1 = [[-1.0], [1.0, 2.0]] expected_00 = [[1.0, 2.0, 1.0, 2.0], [-1.0, -1.0]] expected_01 = [[1.0, 2.0, -1.0], [-1.0, 1.0, 2.0]] expected_10 = [[-1.0, 1.0, 2.0], [1.0, 2.0, -1.0]] expected_11 = [[-1.0, -1.0], [1.0, 2.0, 1.0, 2.0]] for i, (concat, expected) in enumerate(zip( - [l_concat_00, l_concat_01, l_concat_10, l_concat_11], - [expected_00, expected_01, expected_10, expected_11])): + [l_batch_0, l_batch_1, + l_concat_00, l_concat_01, l_concat_10, l_concat_11], + [expected_0, expected_1, + expected_00, expected_01, expected_10, expected_11])): splitted = array_ops.unstack(concat) splitted_stacked_ret = self.evaluate( (list_ops.tensor_list_stack(splitted[0], dtypes.float32), From ade316deef9fabf49029b3c906fec8d9d545ac34 Mon Sep 17 00:00:00 2001 From: Chao Mei Date: Mon, 22 Jul 2019 21:09:21 -0700 Subject: [PATCH 0374/3053] 1. Remove all references to tflite::cpu_backend_support as we no longer do reference-counting on the cpu backend context object and GetFromContext is moved to CpuBackendContext class as a static member function. 2. 
Remove gemmlowp_support.{h,cc} as their functionalities have already been folded into CpuBackendContext class. PiperOrigin-RevId: 259464967 --- tensorflow/lite/experimental/kernels/BUILD | 1 - .../kernels/unidirectional_sequence_gru.cc | 5 +- tensorflow/lite/kernels/BUILD | 19 +--- tensorflow/lite/kernels/conv.cc | 10 +-- .../lite/kernels/cpu_backend_context.cc | 26 ++++++ tensorflow/lite/kernels/cpu_backend_context.h | 2 + .../lite/kernels/cpu_backend_support.cc | 59 ------------- tensorflow/lite/kernels/cpu_backend_support.h | 34 -------- tensorflow/lite/kernels/depthwise_conv.cc | 10 +-- tensorflow/lite/kernels/fully_connected.cc | 16 ++-- tensorflow/lite/kernels/gemmlowp_support.cc | 86 ------------------- tensorflow/lite/kernels/gemmlowp_support.h | 51 ----------- tensorflow/lite/kernels/lstm.cc | 10 +-- tensorflow/lite/kernels/reduce.cc | 6 +- tensorflow/lite/kernels/transpose_conv.cc | 6 +- 15 files changed, 52 insertions(+), 289 deletions(-) delete mode 100644 tensorflow/lite/kernels/cpu_backend_support.cc delete mode 100644 tensorflow/lite/kernels/cpu_backend_support.h delete mode 100644 tensorflow/lite/kernels/gemmlowp_support.cc delete mode 100644 tensorflow/lite/kernels/gemmlowp_support.h diff --git a/tensorflow/lite/experimental/kernels/BUILD b/tensorflow/lite/experimental/kernels/BUILD index aed87a2e643..e3d05ae4f51 100644 --- a/tensorflow/lite/experimental/kernels/BUILD +++ b/tensorflow/lite/experimental/kernels/BUILD @@ -106,7 +106,6 @@ cc_library( "//tensorflow/lite:framework", "//tensorflow/lite/c:c_api_internal", "//tensorflow/lite/kernels:cpu_backend_context", - "//tensorflow/lite/kernels:cpu_backend_support", "//tensorflow/lite/kernels:kernel_util", "//tensorflow/lite/kernels:op_macros", "//tensorflow/lite/kernels/internal:tensor", diff --git a/tensorflow/lite/experimental/kernels/unidirectional_sequence_gru.cc b/tensorflow/lite/experimental/kernels/unidirectional_sequence_gru.cc index fc0d681f3bc..9ef8107dc9f 100644 --- a/tensorflow/lite/experimental/kernels/unidirectional_sequence_gru.cc +++ b/tensorflow/lite/experimental/kernels/unidirectional_sequence_gru.cc @@ -18,7 +18,6 @@ limitations under the License. 
#include "tensorflow/lite/c/c_api_internal.h" #include "tensorflow/lite/experimental/kernels/gru_cell.h" #include "tensorflow/lite/kernels/cpu_backend_context.h" -#include "tensorflow/lite/kernels/cpu_backend_support.h" #include "tensorflow/lite/kernels/internal/tensor.h" #include "tensorflow/lite/kernels/kernel_util.h" @@ -112,14 +111,12 @@ enum TemporaryTensor { }; void* Init(TfLiteContext* context, const char* buffer, size_t length) { - cpu_backend_support::IncrementUsageCounter(context); auto* scratch_tensor_index = new int; context->AddTensors(context, kTemporaryNum, scratch_tensor_index); return scratch_tensor_index; } void Free(TfLiteContext* context, void* buffer) { - cpu_backend_support::DecrementUsageCounter(context); delete reinterpret_cast(buffer); } @@ -221,7 +218,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* output_state = GetOutput(context, node, kOutputState); TfLiteTensor* activation = GetTemporary(context, node, kActivation); TfLiteTensor* concat = GetTemporary(context, node, kConcat); - auto cpu_backend_context = cpu_backend_support::GetFromContext(context); + auto cpu_backend_context = CpuBackendContext::GetFromContext(context); if (gate_weight->type == kTfLiteFloat32) { GruImpl(input, input_state, gate_weight, gate_bias, candidate_weight, diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD index ee9090902ce..2b550c95f08 100644 --- a/tensorflow/lite/kernels/BUILD +++ b/tensorflow/lite/kernels/BUILD @@ -308,23 +308,6 @@ cc_test( ], ) -cc_library( - name = "cpu_backend_support", - srcs = [ - "cpu_backend_support.cc", - ], - hdrs = [ - "cpu_backend_support.h", - ], - copts = tflite_copts(), - deps = [ - ":cpu_backend_context", - ":op_macros", - "//tensorflow/lite:external_cpu_backend_context", - "//tensorflow/lite/c:c_api_internal", - ], -) - cc_library( name = "activation_functor", hdrs = [ @@ -483,7 +466,7 @@ cc_library( visibility = ["//visibility:private"], deps = [ ":activation_functor", - ":cpu_backend_support", + ":cpu_backend_context", ":eigen_support", ":kernel_util", ":lstm_eval", diff --git a/tensorflow/lite/kernels/conv.cc b/tensorflow/lite/kernels/conv.cc index 072d6c6fc2c..6a42beab0f3 100644 --- a/tensorflow/lite/kernels/conv.cc +++ b/tensorflow/lite/kernels/conv.cc @@ -24,7 +24,7 @@ limitations under the License. #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/c_api_internal.h" -#include "tensorflow/lite/kernels/cpu_backend_support.h" +#include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/eigen_support.h" // b/131835803 forces us to include multithreaded_conv.h before optimized_ops.h #ifndef TFLITE_WITH_RUY @@ -115,13 +115,11 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) { // to carry information from Prepare() to Eval(). 
auto* data = new OpData; eigen_support::IncrementUsageCounter(context); - cpu_backend_support::IncrementUsageCounter(context); return data; } void Free(TfLiteContext* context, void* buffer) { eigen_support::DecrementUsageCounter(context); - cpu_backend_support::DecrementUsageCounter(context); delete reinterpret_cast(buffer); } @@ -472,7 +470,7 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node, GetTensorShape(bias), GetTensorData(bias), GetTensorShape(output), GetTensorData(output), GetTensorShape(im2col), GetTensorData(im2col), - cpu_backend_support::GetFromContext(context)); + CpuBackendContext::GetFromContext(context)); break; } } @@ -516,7 +514,7 @@ void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, GetTensorData(bias), GetTensorShape(output), GetTensorData(output), GetTensorShape(im2col), GetTensorData(im2col), - cpu_backend_support::GetFromContext(context)); + CpuBackendContext::GetFromContext(context)); break; } } @@ -564,7 +562,7 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node, GetTensorData(bias), GetTensorShape(output), GetTensorData(output), GetTensorShape(im2col), GetTensorData(im2col), - cpu_backend_support::GetFromContext(context)); + CpuBackendContext::GetFromContext(context)); break; } case kMultithreadOptimized: { diff --git a/tensorflow/lite/kernels/cpu_backend_context.cc b/tensorflow/lite/kernels/cpu_backend_context.cc index 63f12208630..0b38bb6998a 100644 --- a/tensorflow/lite/kernels/cpu_backend_context.cc +++ b/tensorflow/lite/kernels/cpu_backend_context.cc @@ -20,6 +20,32 @@ limitations under the License. namespace tflite { +CpuBackendContext* CpuBackendContext::GetFromContext(TfLiteContext* context) { + auto* external_context = static_cast( + context->GetExternalContext(context, kTfLiteCpuBackendContext)); + + if (external_context == nullptr) { + TF_LITE_FATAL( + "ExternalCpuBackendContext isn't properly initialized during TFLite " + "interpreter initialization."); + } + + auto* cpu_backend_context = static_cast( + external_context->internal_backend_context()); + if (cpu_backend_context == nullptr) { + // We do the lazy initialization here for the TfLiteInternalBackendContext + // that's wrapped inside ExternalCpuBackendContext. + cpu_backend_context = new CpuBackendContext(); + if (context->recommended_num_threads != -1) { + cpu_backend_context->SetMaxNumThreads(context->recommended_num_threads); + } + external_context->set_internal_backend_context( + std::unique_ptr(cpu_backend_context)); + } + + return cpu_backend_context; +} + CpuBackendContext::CpuBackendContext() : TfLiteInternalBackendContext(), ruy_context_(new ruy::Context), diff --git a/tensorflow/lite/kernels/cpu_backend_context.h b/tensorflow/lite/kernels/cpu_backend_context.h index a55a951ac99..c64eae2f6f3 100644 --- a/tensorflow/lite/kernels/cpu_backend_context.h +++ b/tensorflow/lite/kernels/cpu_backend_context.h @@ -26,6 +26,8 @@ namespace tflite { class CpuBackendContext final : public TfLiteInternalBackendContext { public: + static CpuBackendContext* GetFromContext(TfLiteContext* context); + CpuBackendContext(); ~CpuBackendContext() override; diff --git a/tensorflow/lite/kernels/cpu_backend_support.cc b/tensorflow/lite/kernels/cpu_backend_support.cc deleted file mode 100644 index ab47d5b7e99..00000000000 --- a/tensorflow/lite/kernels/cpu_backend_support.cc +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include "tensorflow/lite/kernels/cpu_backend_support.h" - -#include - -#include "tensorflow/lite/c/c_api_internal.h" -#include "tensorflow/lite/external_cpu_backend_context.h" -#include "tensorflow/lite/kernels/cpu_backend_context.h" -#include "tensorflow/lite/kernels/op_macros.h" - -namespace tflite { -namespace cpu_backend_support { - -// TODO(b/130950871): Remove all refrences to the following two no-op functions -// once the new ExternalCpuBackendContext class is checked in. -void IncrementUsageCounter(TfLiteContext* context) {} -void DecrementUsageCounter(TfLiteContext* context) {} - -CpuBackendContext* GetFromContext(TfLiteContext* context) { - auto* external_context = static_cast( - context->GetExternalContext(context, kTfLiteCpuBackendContext)); - - if (external_context == nullptr) { - TF_LITE_FATAL( - "ExternalCpuBackendContext isn't properly initialized during TFLite " - "interpreter initialization."); - } - - auto* cpu_backend_context = static_cast( - external_context->internal_backend_context()); - if (cpu_backend_context == nullptr) { - // We do the lazy initialization here for the TfLiteInternalBackendContext - // that's wrapped inside ExternalCpuBackendContext. - cpu_backend_context = new CpuBackendContext(); - if (context->recommended_num_threads != -1) { - cpu_backend_context->SetMaxNumThreads(context->recommended_num_threads); - } - external_context->set_internal_backend_context( - std::unique_ptr(cpu_backend_context)); - } - - return cpu_backend_context; -} - -} // namespace cpu_backend_support -} // namespace tflite diff --git a/tensorflow/lite/kernels/cpu_backend_support.h b/tensorflow/lite/kernels/cpu_backend_support.h deleted file mode 100644 index e7cec5cdd23..00000000000 --- a/tensorflow/lite/kernels/cpu_backend_support.h +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ -#ifndef TENSORFLOW_LITE_KERNELS_CPU_BACKEND_SUPPORT_H_ -#define TENSORFLOW_LITE_KERNELS_CPU_BACKEND_SUPPORT_H_ - -#include "tensorflow/lite/c/c_api_internal.h" -#include "tensorflow/lite/kernels/cpu_backend_context.h" - -namespace tflite { - -namespace cpu_backend_support { - -CpuBackendContext* GetFromContext(TfLiteContext* context); - -void IncrementUsageCounter(TfLiteContext* context); - -void DecrementUsageCounter(TfLiteContext* context); - -} // namespace cpu_backend_support -} // namespace tflite - -#endif // TENSORFLOW_LITE_KERNELS_CPU_BACKEND_SUPPORT_H_ diff --git a/tensorflow/lite/kernels/depthwise_conv.cc b/tensorflow/lite/kernels/depthwise_conv.cc index f3010549406..bfa3697c0a9 100644 --- a/tensorflow/lite/kernels/depthwise_conv.cc +++ b/tensorflow/lite/kernels/depthwise_conv.cc @@ -24,7 +24,7 @@ limitations under the License. #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/c_api_internal.h" -#include "tensorflow/lite/kernels/cpu_backend_support.h" +#include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/internal/optimized/cpu_check.h" #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_multithread.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" @@ -70,7 +70,6 @@ struct OpData { }; void* Init(TfLiteContext* context, const char* buffer, size_t length) { - cpu_backend_support::IncrementUsageCounter(context); // This is a builtin op, so we don't use the contents in 'buffer', if any. // Instead, we allocate a new object to carry information from Prepare() to // Eval(). @@ -78,7 +77,6 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) { } void Free(TfLiteContext* context, void* buffer) { - cpu_backend_support::DecrementUsageCounter(context); delete reinterpret_cast(buffer); } @@ -207,7 +205,7 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node, GetTensorShape(filter), GetTensorData(filter), GetTensorShape(bias), GetTensorData(bias), GetTensorShape(output), GetTensorData(output), - cpu_backend_support::GetFromContext(context)); + CpuBackendContext::GetFromContext(context)); } } @@ -248,7 +246,7 @@ void EvalQuantized(TfLiteContext* context, TfLiteNode* node, GetTensorShape(filter), GetTensorData(filter), GetTensorShape(bias), GetTensorData(bias), GetTensorShape(output), GetTensorData(output), - cpu_backend_support::GetFromContext(context)); + CpuBackendContext::GetFromContext(context)); } } @@ -290,7 +288,7 @@ void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, GetTensorData(filter), GetTensorShape(bias), GetTensorData(bias), GetTensorShape(output), GetTensorData(output), - cpu_backend_support::GetFromContext(context)); + CpuBackendContext::GetFromContext(context)); } } diff --git a/tensorflow/lite/kernels/fully_connected.cc b/tensorflow/lite/kernels/fully_connected.cc index bca595eb836..64da1533614 100644 --- a/tensorflow/lite/kernels/fully_connected.cc +++ b/tensorflow/lite/kernels/fully_connected.cc @@ -25,7 +25,7 @@ limitations under the License. 
#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/c_api_internal.h" #include "tensorflow/lite/kernels/activation_functor.h" -#include "tensorflow/lite/kernels/cpu_backend_support.h" +#include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h" @@ -115,7 +115,6 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) { // This is a builtin op, so we don't use the contents in 'buffer', if any. // Instead, we allocate a new object to carry information from Prepare() to // Eval(). - cpu_backend_support::IncrementUsageCounter(context); auto* op_data = new OpData(); context->AddTensors(context, /*tensors_to_add=*/2, &op_data->scratch_tensor_index); @@ -123,7 +122,6 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) { } void Free(TfLiteContext* context, void* buffer) { - cpu_backend_support::DecrementUsageCounter(context); delete reinterpret_cast(buffer); } @@ -398,13 +396,13 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node, GetTensorShape(filter), GetTensorData(filter), GetTensorShape(bias), GetTensorData(bias), GetTensorShape(output), GetTensorData(output), - cpu_backend_support::GetFromContext(context)); + CpuBackendContext::GetFromContext(context)); } break; case kTfLiteInt8: FullyConnectedInt8( data, input, filter, bias, output, - cpu_backend_support::GetFromContext(context)); + CpuBackendContext::GetFromContext(context)); break; case kTfLiteInt16: if (kernel_type == kReference) { @@ -419,7 +417,7 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node, GetTensorShape(filter), GetTensorData(filter), GetTensorShape(bias), GetTensorData(bias), GetTensorShape(output), GetTensorData(output), - cpu_backend_support::GetFromContext(context)); + CpuBackendContext::GetFromContext(context)); } break; default: @@ -456,7 +454,7 @@ TfLiteStatus EvalShuffledQuantized(TfLiteContext* context, TfLiteNode* node, GetTensorShape(bias), GetTensorData(bias), \ GetTensorShape(output), GetTensorData(output), \ GetTensorData(shuffled_input_workspace), \ - cpu_backend_support::GetFromContext(context)); \ + CpuBackendContext::GetFromContext(context)); \ } FullyConnectedParams op_params; op_params.output_multiplier = data->output_multiplier; @@ -477,7 +475,7 @@ TfLiteStatus EvalShuffledQuantized(TfLiteContext* context, TfLiteNode* node, GetTensorShape(bias), GetTensorData(bias), GetTensorShape(output), GetTensorData(output), GetTensorData(shuffled_input_workspace), - cpu_backend_support::GetFromContext(context)); + CpuBackendContext::GetFromContext(context)); } #undef TF_LITE_SHUFFLED_FULLY_CONNECTED @@ -512,7 +510,7 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node, GetTensorShape(filter), GetTensorData(filter), GetTensorShape(bias), GetTensorData(bias), GetTensorShape(output), GetTensorData(output), - cpu_backend_support::GetFromContext(context)); + CpuBackendContext::GetFromContext(context)); } return kTfLiteOk; diff --git a/tensorflow/lite/kernels/gemmlowp_support.cc b/tensorflow/lite/kernels/gemmlowp_support.cc deleted file mode 100644 index 410a72ca3f6..00000000000 --- a/tensorflow/lite/kernels/gemmlowp_support.cc +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include "tensorflow/lite/kernels/gemmlowp_support.h" - -#include - -#include "tensorflow/lite/kernels/op_macros.h" - -namespace tflite { -namespace gemmlowp_support { -namespace { - -struct RefCountedGemmlowpContext : public TfLiteExternalContext { - std::unique_ptr gemmlowp_context; - int num_references = 0; -}; - -RefCountedGemmlowpContext* GetGemmLowpContext(TfLiteContext* context) { - return reinterpret_cast( - context->GetExternalContext(context, kTfLiteGemmLowpContext)); -} - -TfLiteStatus Refresh(TfLiteContext* context) { - auto* ptr = GetGemmLowpContext(context); - if (ptr != nullptr) { - ptr->gemmlowp_context->set_max_num_threads( - context->recommended_num_threads); - } - return kTfLiteOk; -} - -} // namespace - -void IncrementUsageCounter(TfLiteContext* context) { - auto* ptr = GetGemmLowpContext(context); - if (ptr == nullptr) { - ptr = new RefCountedGemmlowpContext; - ptr->type = kTfLiteGemmLowpContext; - ptr->Refresh = Refresh; - ptr->gemmlowp_context.reset(new gemmlowp::GemmContext()); - if (context->recommended_num_threads != -1) { - ptr->gemmlowp_context->set_max_num_threads( - context->recommended_num_threads); - } - ptr->num_references = 0; - context->SetExternalContext(context, kTfLiteGemmLowpContext, ptr); - } - ptr->num_references++; -} - -void DecrementUsageCounter(TfLiteContext* context) { - auto* ptr = GetGemmLowpContext(context); - if (ptr == nullptr) { - TF_LITE_FATAL( - "Call to DecrementUsageCounter() not preceded by " - "IncrementUsageCounter()"); - } - if (--ptr->num_references == 0) { - delete ptr; - context->SetExternalContext(context, kTfLiteGemmLowpContext, nullptr); - } -} - -gemmlowp::GemmContext* GetFromContext(TfLiteContext* context) { - auto* ptr = GetGemmLowpContext(context); - if (ptr == nullptr) { - TF_LITE_FATAL( - "Call to GetFromContext() not preceded by IncrementUsageCounter()"); - } - return ptr->gemmlowp_context.get(); -} - -} // namespace gemmlowp_support -} // namespace tflite diff --git a/tensorflow/lite/kernels/gemmlowp_support.h b/tensorflow/lite/kernels/gemmlowp_support.h deleted file mode 100644 index 9679326a533..00000000000 --- a/tensorflow/lite/kernels/gemmlowp_support.h +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ -#ifndef TENSORFLOW_LITE_KERNELS_GEMMLOWP_SUPPORT_H_ -#define TENSORFLOW_LITE_KERNELS_GEMMLOWP_SUPPORT_H_ - -#include "public/gemmlowp.h" -#include "tensorflow/lite/c/c_api_internal.h" - -namespace tflite { -namespace gemmlowp_support { - -// Returns the GemmContext stored in 'context', allowing multiple ops to -// share a single object, as long as they share a TfLiteContext. The caller -// must ensure that this is called between IncrementUsageCounter() and -// DecrementUsageCounter(). For example, in the implementation of an op: -// void* Init(TfLiteContext* context, const char*, size_t) { -// gemmlowp_support::IncrementUsageCounter(context); -// return nullptr; -// } -// void Free(TfLiteContext* context, void*) { -// gemmlowp_support::DecrementUsageCounter(context); -// } -// TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { -// auto* gemmlowp_context = gemmlowp_support::GetFromContext(context); -// } -gemmlowp::GemmContext* GetFromContext(TfLiteContext* context); - -// Let the framework know that the GemmContext stored in 'context' will be used -// by an op. If necessary a new GemmContext is created and placed in 'context'. -void IncrementUsageCounter(TfLiteContext* context); - -// Let the framework know that the op stopped using the GemmContext stored in -// 'context'. If there are no more usages the GemmContext will be deleted. -void DecrementUsageCounter(TfLiteContext* context); - -} // namespace gemmlowp_support -} // namespace tflite - -#endif // TENSORFLOW_LITE_KERNELS_GEMMLOWP_SUPPORT_H_ diff --git a/tensorflow/lite/kernels/lstm.cc b/tensorflow/lite/kernels/lstm.cc index 19ec80889e7..1dfd0a9dacc 100644 --- a/tensorflow/lite/kernels/lstm.cc +++ b/tensorflow/lite/kernels/lstm.cc @@ -23,7 +23,7 @@ limitations under the License. 
#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/c_api_internal.h" #include "tensorflow/lite/kernels/activation_functor.h" -#include "tensorflow/lite/kernels/cpu_backend_support.h" +#include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/internal/kernel_utils.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" @@ -796,7 +796,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { GetTensorShape(activation_out), GetTensorData(activation_out), GetTensorShape(concat_temp), GetTensorData(concat_temp), GetTensorShape(activation_temp), GetTensorData(activation_temp), - cpu_backend_support::GetFromContext(context)); + CpuBackendContext::GetFromContext(context)); } else if (input->type == kTfLiteUInt8 && prev_activation->type == kTfLiteUInt8 && weights->type == kTfLiteUInt8 && bias->type == kTfLiteInt32 && @@ -844,7 +844,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { GetTensorShape(concat_temp), GetTensorData(concat_temp), GetTensorShape(activation_temp), GetTensorData(activation_temp), - cpu_backend_support::GetFromContext(context)); + CpuBackendContext::GetFromContext(context)); } else { context->ReportError(context, "Unsupported combination of data types for LstmCell"); @@ -866,10 +866,8 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) { const auto* params = reinterpret_cast(buffer); switch (params->kernel_type) { case kTfLiteLSTMFullKernel: - cpu_backend_support::IncrementUsageCounter(context); return full::Init(context, buffer, length); case kTfLiteLSTMBasicKernel: - cpu_backend_support::IncrementUsageCounter(context); return basic::Init(context, buffer, length); default: return nullptr; @@ -877,8 +875,6 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) { return nullptr; } void Free(TfLiteContext* context, void* buffer) { - cpu_backend_support::DecrementUsageCounter(context); - delete reinterpret_cast(buffer); } diff --git a/tensorflow/lite/kernels/reduce.cc b/tensorflow/lite/kernels/reduce.cc index d28ec70f98a..3474a403495 100644 --- a/tensorflow/lite/kernels/reduce.cc +++ b/tensorflow/lite/kernels/reduce.cc @@ -20,7 +20,7 @@ limitations under the License. #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/c_api_internal.h" -#include "tensorflow/lite/kernels/cpu_backend_support.h" +#include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" #include "tensorflow/lite/kernels/internal/reference/integer_ops/mean.h" @@ -62,7 +62,6 @@ struct OpContext { }; void* Init(TfLiteContext* context, const char* buffer, size_t length) { - cpu_backend_support::IncrementUsageCounter(context); // Creates two temp tensors to store index and axis for internal // implementation only. 
   auto* op_data = new OpData();
@@ -71,7 +70,6 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
 }
 
 void Free(TfLiteContext* context, void* buffer) {
-  cpu_backend_support::DecrementUsageCounter(context);
   delete reinterpret_cast(buffer);
 }
 
@@ -306,7 +304,7 @@ TfLiteStatus EvalMean(TfLiteContext* context, TfLiteNode* node) {
           GetTensorData(op_context.output),
           op_context.output->params.zero_point,
           op_context.output->params.scale,
-          cpu_backend_support::GetFromContext(context));
+          CpuBackendContext::GetFromContext(context));
     } else {
       reference_ops::Mean(op_params, GetTensorShape(input),
                           GetTensorData(input),
diff --git a/tensorflow/lite/kernels/transpose_conv.cc b/tensorflow/lite/kernels/transpose_conv.cc
index 8bca828a1d9..c4447b2a468 100644
--- a/tensorflow/lite/kernels/transpose_conv.cc
+++ b/tensorflow/lite/kernels/transpose_conv.cc
@@ -21,7 +21,7 @@ limitations under the License.
 
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
-#include "tensorflow/lite/kernels/cpu_backend_support.h"
+#include "tensorflow/lite/kernels/cpu_backend_context.h"
 #include "tensorflow/lite/kernels/eigen_support.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
@@ -86,13 +86,11 @@ struct OpData {
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
   auto* data = new OpData;
   eigen_support::IncrementUsageCounter(context);
-  cpu_backend_support::IncrementUsageCounter(context);
   return data;
 }
 
 void Free(TfLiteContext* context, void* buffer) {
   eigen_support::DecrementUsageCounter(context);
-  cpu_backend_support::DecrementUsageCounter(context);
   delete reinterpret_cast(buffer);
 }
 
@@ -338,7 +336,7 @@ void EvalFloat(TfLiteContext* context, const TfLiteTransposeConvParams* params,
                GetTensorData(transposed_weights), GetTensorShape(output),
                GetTensorData(output), GetTensorShape(col2im),
                GetTensorData(col2im),
-               cpu_backend_support::GetFromContext(context));
+               CpuBackendContext::GetFromContext(context));
       break;
     }
   }

From 1de23834beaa10e6f25e2c2f50a7e1c7ebe953b5 Mon Sep 17 00:00:00 2001
From: Smit Hinsu
Date: Mon, 22 Jul 2019 21:51:39 -0700
Subject: [PATCH 0375/3053] Map TensorFlow StatelessIf and If op to a common If op in MLIR

The TensorFlow StatelessIf and If ops differ only in the is_stateful property
and are identical otherwise. Introduced an additional attribute in the MLIR op
definition to differentiate them, and mapped to and from the common op during
import to and export from MLIR, respectively.

Thanks Mehdi for the suggestion!
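To make the mapping concrete, here is a minimal Python sketch of the rule described
above (the dict-based NodeDef and helper names are assumptions for illustration only;
the actual implementation is the C++ importer/exporter change later in this patch):

    # Export: the common MLIR op becomes StatelessIf or If depending on the
    # differentiating attribute, which is then dropped because TensorFlow's
    # own If/StatelessIf ops do not carry it.
    def export_if_node(node_def):
      attrs = node_def.setdefault("attr", {})
      node_def["op"] = "StatelessIf" if attrs.pop("is_stateless", False) else "If"
      return node_def

    # Import: both TensorFlow variants map onto the one common op, and the
    # variant that was seen is recorded in is_stateless.
    def import_if_node(node_def):
      attrs = node_def.setdefault("attr", {})
      attrs["is_stateless"] = node_def["op"] == "StatelessIf"
      node_def["op"] = "tf.If"
      return node_def

    # Example round trip: {"op": "StatelessIf"} -> is_stateless = True -> "StatelessIf".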
PiperOrigin-RevId: 259468359 --- .../lite/tests/mlir2flatbuffer/if_op.mlir | 2 +- .../compiler/mlir/tensorflow/ir/tf_ops.td | 5 ++++- .../tests/functional-control-flow-to-cfg.mlir | 8 ++++---- .../graphdef2mlir/graph-function-defs.pbtxt | 2 +- .../graph-function-static-output.pbtxt | 2 +- .../mlir/tensorflow/tests/tf-ops.mlir | 19 ++++++++++++------- .../tensorflow/translate/import_graphdef.cc | 15 +++++++++++++-- .../mlir/tensorflow/utils/export_utils.cc | 14 ++++++++++++++ 8 files changed, 50 insertions(+), 17 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/if_op.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/if_op.mlir index 03048bd640d..726441876cd 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/if_op.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/if_op.mlir @@ -160,7 +160,7 @@ func @main(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) -> tensor<1xf32> { %0 = "tfl.pseudo_input"(%arg0) : (tensor<1xf32>) -> tensor<1xf32> %1 = "tfl.pseudo_input"(%arg1) : (tensor<1xf32>) -> tensor<1xf32> %2 = "tfl.less"(%0, %1) : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xi1> - %3 = "tf.If"(%2, %0, %1) {else_branch = @cond_false, then_branch = @cond_true} : (tensor<1xi1>, tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + %3 = "tf.If"(%2, %0, %1) {else_branch = @cond_false, then_branch = @cond_true, is_stateless = false} : (tensor<1xi1>, tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> return %3 : tensor<1xf32> } diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td index d920f471bbf..a803826cc66 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td @@ -103,7 +103,10 @@ else_branch: A function that takes 'inputs' and returns a list of SymbolRefAttr:$then_branch, SymbolRefAttr:$else_branch, - DefaultValuedAttr:$output_shapes + DefaultValuedAttr:$output_shapes, + + // Used to map StatelessIf and If op defined in TensorFlow to a common op. 
+ BoolAttr:$is_stateless ); let results = (outs diff --git a/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-cfg.mlir b/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-cfg.mlir index 82fc0171fa6..79f471b3869 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-cfg.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-cfg.mlir @@ -7,7 +7,7 @@ func @testIf1Else(tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> func @testIf1Result(tensor, tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> { ^bb0(%arg0: tensor, %arg1: tensor<*xf32>, %arg2: tensor<*xf32>): %1 = "tf.If"(%arg0, %arg1, %arg2) { - then_branch = @testIf1Then, else_branch = @testIf1Else + then_branch = @testIf1Then, else_branch = @testIf1Else, is_stateless = false } : (tensor, tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> // CHECK: %0 = extract_element %arg0[] : tensor @@ -31,7 +31,7 @@ func @testIf3Else(tensor<*xf32>) -> (tensor<*xf32>, tensor<*xi8>, tensor<*xbf16> func @testIf3Result(tensor, tensor<*xf32>) -> (tensor<*xf32>, tensor<*xi8>, tensor<*xbf16>) { ^bb0(%arg0: tensor, %arg1: tensor<*xf32>): %1:3 = "tf.If"(%arg0, %arg1) { - then_branch = @testIf3Then, else_branch = @testIf3Else + then_branch = @testIf3Then, else_branch = @testIf3Else, is_stateless = false } : (tensor, tensor<*xf32>) -> (tensor<*xf32>, tensor<*xi8>, tensor<*xbf16>) // CHECK: %0 = extract_element %arg0[] : tensor @@ -57,7 +57,7 @@ func @testIf1Casts(tensor, tensor<2x2xf32>, tensor<*xf32>) -> tensor<2x?xf32 ^bb0(%arg0: tensor, %arg1: tensor<2x2xf32>, %arg2: tensor<*xf32>): %1 = "tf.If"(%arg0, %arg1, %arg2) { - then_branch = @testIf1Then, else_branch = @testIf1Else + then_branch = @testIf1Then, else_branch = @testIf1Else, is_stateless = false } : (tensor, tensor<2x2xf32>, tensor<*xf32>) -> tensor<2x?xf32> // CHECK: %0 = extract_element %arg0[] : tensor @@ -97,7 +97,7 @@ func @testIf1x4(tensor<4xi1>, tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> { // expected-error @+1 {{only supports zero-D bool tensors now}} %1 = "tf.If"(%arg0, %arg1, %arg2) { - then_branch = @testIf1Then, else_branch = @testIf1Else + then_branch = @testIf1Then, else_branch = @testIf1Else, is_stateless = false } : (tensor<4xi1>, tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> return %1 : tensor<*xf32> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-defs.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-defs.pbtxt index 2488716e913..249a1efa952 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-defs.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-defs.pbtxt @@ -517,7 +517,7 @@ versions { # CHECK-NEXT: %9:2 = "_tf.Identity"(%2#0, %7) {T = "tfdtype$DT_INT32", _tpu_replicate = "cluster", device = "", name = "replicated_input_0"} : (tensor<*xi32>, !_tf.control) -> (tensor<*xi32>, !_tf.control) # CHECK-NEXT: %10:2 = "_tf.Identity"(%4#0, %7) {T = "tfdtype$DT_INT32", _tpu_replicate = "cluster", device = "", name = "replicated_input_1"} : (tensor<*xi32>, !_tf.control) -> (tensor<*xi32>, !_tf.control) # CHECK-NEXT: %11:2 = "_tf.Less"(%9#0, %10#0) {T = "tfdtype$DT_INT32", _tpu_replicate = "cluster", device = "", name = "Less"} : (tensor<*xi32>, tensor<*xi32>) -> (tensor<*xi1>, !_tf.control) -# CHECK-NEXT: %12:3 = "_tf.If"(%11#0, %10#0, %9#0) {Tcond = "tfdtype$DT_BOOL", Tin = ["tfdtype$DT_INT32", "tfdtype$DT_INT32"], Tout = ["tfdtype$DT_INT32", "tfdtype$DT_INT32"], _tpu_replicate = 
"cluster", device = "", else_branch = @cond_false0, name = "cond", output_shapes = ["tfshape$unknown_rank: true\0A", "tfshape$unknown_rank: true\0A"], then_branch = @cond_true0, then_branch.how_many = 32 : i64, then_branch.ping = "ack"} : (tensor<*xi1>, tensor<*xi32>, tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi32>, !_tf.control) +# CHECK-NEXT: %12:3 = "_tf.If"(%11#0, %10#0, %9#0) {Tcond = "tfdtype$DT_BOOL", Tin = ["tfdtype$DT_INT32", "tfdtype$DT_INT32"], Tout = ["tfdtype$DT_INT32", "tfdtype$DT_INT32"], _tpu_replicate = "cluster", device = "", else_branch = @cond_false0, is_stateless = false, name = "cond", output_shapes = ["tfshape$unknown_rank: true\0A", "tfshape$unknown_rank: true\0A"], then_branch = @cond_true0, then_branch.how_many = 32 : i64, then_branch.ping = "ack"} : (tensor<*xi1>, tensor<*xi32>, tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi32>, !_tf.control) # CHECK-NEXT: %13:2 = "_tf.Identity"(%12#0) {T = "tfdtype$DT_INT32", _tpu_replicate = "cluster", device = "/device:TPU_REPLICATED_CORE:0", name = "Identity"} : (tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) # CHECK-NEXT: %14:2 = "_tf.TPUReplicatedOutput"(%13#0) {T = "tfdtype$DT_INT32", device = "", name = "output0", num_replicas = 1 : i64} : (tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) # CHECK-NEXT: %15:2 = "_tf.Identity"(%14#0, %6) {T = "tfdtype$DT_INT32", device = "", name = "output_0_shard_0"} : (tensor<*xi32>, !_tf.control) -> (tensor<*xi32>, !_tf.control) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-static-output.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-static-output.pbtxt index 41107cfbff4..3ddbf783d64 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-static-output.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-static-output.pbtxt @@ -142,7 +142,7 @@ versions { #CHECK: func @main() { #CHECK-NEXT: %0:2 = "_tf.Placeholder"() {device = "", dtype = "tfdtype$DT_BOOL", name = "Placeholder", shape = "tfshape$unknown_rank: true\0A"} : () -> (tensor<*xi1>, !_tf.control) #CHECK-NEXT: %1:2 = "_tf.Placeholder"() {device = "", dtype = "tfdtype$DT_INT32", name = "Placeholder_1", shape = "tfshape$unknown_rank: true\0A"} : () -> (tensor<*xi32>, !_tf.control) -#CHECK-NEXT: %2:2 = "_tf.If"(%0#0, %1#0) {Tcond = "tfdtype$DT_BOOL", Tin = ["tfdtype$DT_INT32"], Tout = ["tfdtype$DT_INT32"], device = "", else_branch = @get_zeros0, name = "If", output_shapes = [], then_branch = @identity0} : (tensor<*xi1>, tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) +#CHECK-NEXT: %2:2 = "_tf.If"(%0#0, %1#0) {Tcond = "tfdtype$DT_BOOL", Tin = ["tfdtype$DT_INT32"], Tout = ["tfdtype$DT_INT32"], device = "", else_branch = @get_zeros0, is_stateless = false, name = "If", output_shapes = [], then_branch = @identity0} : (tensor<*xi1>, tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) #CHECK-NEXT: return #CHECK-NEXT: } #CHECK: func @get_zeros0(%arg0: tensor<*xi32>) -> tensor<2xi32> { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir index f1c480049e3..d37892dd5df 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir @@ -486,7 +486,7 @@ func @testIfElse(tensor<*xf32>) -> tensor<*xf32> func @testValidIfOp(tensor, tensor<2xf32>) -> tensor<2xf32> { ^bb0(%arg0: tensor, %arg1: tensor<2xf32>): %1 = "tf.If"(%arg0, %arg1) { - then_branch = @testIfThen, else_branch = @testIfElse + 
then_branch = @testIfThen, else_branch = @testIfElse, is_stateless = false } : (tensor, tensor<2xf32>) -> tensor<2xf32> return %1 : tensor<2xf32> @@ -503,7 +503,8 @@ func @testInvalidIfOp(tensor, f32) -> f32 { // expected-error @+1 {{operand #1 must be tensor of tf.dtype values}} %1 = "tf.If"(%arg0, %arg1) { then_branch = @testIfThen, - else_branch = @testIfElse + else_branch = @testIfElse, + is_stateless = false } : (tensor, f32) -> f32 return %1 : f32 @@ -518,7 +519,7 @@ func @testInvalidIfOp(tensor, tensor<2xf32>) -> tensor<2xf32> { ^bb0(%arg0: tensor, %arg1: tensor<2xf32>): // expected-error @+1 {{requires attribute 'then_branch'}} %1 = "tf.If"(%arg0, %arg1) { - else_branch = @testIfElse + else_branch = @testIfElse, is_stateless = false } : (tensor, tensor<2xf32>) -> tensor<2xf32> return %1 : tensor<2xf32> @@ -535,7 +536,8 @@ func @testInvalidIfOp(tensor, tensor<2xf32>) -> tensor<2xf32> { // expected-error @+1 {{branches should have 1 inputs}} %1 = "tf.If"(%arg0, %arg1) { then_branch = @testIfThen, - else_branch = @testIfElse + else_branch = @testIfElse, + is_stateless = false } : (tensor, tensor<2xf32>) -> tensor<2xf32> return %1 : tensor<2xf32> @@ -552,7 +554,8 @@ func @testInvalidIfOp(tensor, tensor<2xf32>) -> tensor<2xf32> { // expected-error @+1 {{then branch input type tensor<*xf16> is incompatible with operand type tensor<2xf32>}} %1 = "tf.If"(%arg0, %arg1) { then_branch = @testIfThen, - else_branch = @testIfElse + else_branch = @testIfElse, + is_stateless = false } : (tensor, tensor<2xf32>) -> tensor<2xf32> return %1 : tensor<2xf32> @@ -569,7 +572,8 @@ func @testInvalidIfOp(tensor, tensor<*xf32>) -> tensor<2xf32> { // expected-error @+1 {{branches inputs have incompatible types tensor<2xf32> and tensor<3xf32>}} %1 = "tf.If"(%arg0, %arg1) { then_branch = @testIfThen, - else_branch = @testIfElse + else_branch = @testIfElse, + is_stateless = false } : (tensor, tensor<*xf32>) -> tensor<2xf32> return %1 : tensor<2xf32> @@ -586,7 +590,8 @@ func @testInvalidIfOp(tensor, tensor<*xf32>) -> tensor<2xf32> { // expected-error @+1 {{else branch result type tensor<3xf32> is incompatible with op result type tensor<2xf32>}} %1 = "tf.If"(%arg0, %arg1) { then_branch = @testIfThen, - else_branch = @testIfElse + else_branch = @testIfElse, + is_stateless = false } : (tensor, tensor<*xf32>) -> tensor<2xf32> return %1 : tensor<2xf32> diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_graphdef.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_graphdef.cc index 2ac09e3540d..0b9012d9df0 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_graphdef.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_graphdef.cc @@ -979,9 +979,12 @@ Status Importer::ConvertNode(const Node& node) { node_type_name = (*tf_name_to_mlir_name_)[node_type_name]; } - const char* kTfControlFlowFormPrefix = "_tf."; - std::string op_name = kTfControlFlowFormPrefix + node_type_name; + auto get_full_op_name = [&](const std::string& op_name) { + const char* kTfControlFlowFormPrefix = "_tf."; + return kTfControlFlowFormPrefix + op_name; + }; + std::string op_name = get_full_op_name(node_type_name); if (back_edge_node_output_.contains(&node)) { op_name = op_name + ".sink"; } @@ -1082,6 +1085,14 @@ Status Importer::ConvertNode(const Node& node) { result.attributes.push_back(builder_->getNamedAttr( "device", builder_->getStringAttr(std::string(node_def.device())))); + // Map If and StatelessIf op in TensorFlow to the common If op in MLIR and add + // the differentiating attribute. 
+ if (node.IsIfNode()) { + result.name = mlir::OperationName(get_full_op_name("If"), context_); + mlir::BoolAttr val = builder_->getBoolAttr(node_type_name == "StatelessIf"); + result.attributes.push_back(builder_->getNamedAttr("is_stateless", val)); + } + node_values_[node.id()] = builder_->createOperation(result); return Status::OK(); } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc index a2f803c0858..7befa9ac28e 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc @@ -160,6 +160,18 @@ Status ConvertAttribute(const mlir::ArrayAttr& attr, AttrValue* value) { return Status::OK(); } +// Updates NodeDef constructed out of an MLIR If op to map it to either +// TensorFlow StatelessIf or If op depending on the additional attribute. +void UpdateCompositeIfOp(NodeDef* node_def) { + auto it = node_def->mutable_attr()->find("is_stateless"); + if (it != node_def->attr().end()) { + if (it->second.b()) { + *node_def->mutable_op() = "StatelessIf"; + } + node_def->mutable_attr()->erase(it); + } +} + } // anonymous namespace StatusOr> GetOperationNodeDef( @@ -194,6 +206,8 @@ StatusOr> GetOperationNodeDef( TF_RETURN_IF_ERROR(ConvertLocation( inst->getLoc(), node_def->mutable_experimental_debug_info())); + if (node_def->op() == "If") UpdateCompositeIfOp(node_def.get()); + return node_def; } From 8281648f9c00f8ea760bbc6a3770dfcd4503f0eb Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 22 Jul 2019 22:46:03 -0700 Subject: [PATCH 0376/3053] Add prelu op for micro PiperOrigin-RevId: 259473219 --- .../lite/experimental/micro/kernels/BUILD | 15 ++ .../micro/kernels/all_ops_resolver.cc | 2 + .../lite/experimental/micro/kernels/prelu.cc | 114 ++++++++++ .../experimental/micro/kernels/prelu_test.cc | 204 ++++++++++++++++++ .../experimental/micro/tools/make/Makefile | 1 + tensorflow/lite/kernels/internal/BUILD | 2 + .../lite/kernels/internal/reference/prelu.h | 77 +++++++ .../internal/reference/reference_ops.h | 48 +---- 8 files changed, 416 insertions(+), 47 deletions(-) create mode 100644 tensorflow/lite/experimental/micro/kernels/prelu.cc create mode 100644 tensorflow/lite/experimental/micro/kernels/prelu_test.cc create mode 100644 tensorflow/lite/kernels/internal/reference/prelu.h diff --git a/tensorflow/lite/experimental/micro/kernels/BUILD b/tensorflow/lite/experimental/micro/kernels/BUILD index 43288c9de60..5121bc3d15b 100644 --- a/tensorflow/lite/experimental/micro/kernels/BUILD +++ b/tensorflow/lite/experimental/micro/kernels/BUILD @@ -19,6 +19,7 @@ cc_library( "elementwise.cc", "fully_connected.cc", "pooling.cc", + "prelu.cc", "softmax.cc", ], hdrs = [ @@ -59,6 +60,7 @@ cc_library( "fully_connected.cc", "pooling.cc", "portable_optimized/depthwise_conv.cc", + "prelu.cc", "softmax.cc", ], hdrs = [ @@ -179,3 +181,16 @@ tflite_micro_cc_test( "//tensorflow/lite/experimental/micro/testing:micro_test", ], ) + +tflite_micro_cc_test( + name = "prelu_test", + srcs = [ + "prelu_test.cc", + ], + deps = [ + ":all_ops_resolver", + "//tensorflow/lite/c:c_api_internal", + "//tensorflow/lite/experimental/micro:micro_framework", + "//tensorflow/lite/experimental/micro/testing:micro_test", + ], +) diff --git a/tensorflow/lite/experimental/micro/kernels/all_ops_resolver.cc b/tensorflow/lite/experimental/micro/kernels/all_ops_resolver.cc index 6fb2e664802..c54cdf78f6c 100644 --- a/tensorflow/lite/experimental/micro/kernels/all_ops_resolver.cc +++ 
b/tensorflow/lite/experimental/micro/kernels/all_ops_resolver.cc @@ -23,6 +23,7 @@ TfLiteRegistration* Register_CONV_2D(); TfLiteRegistration* Register_AVERAGE_POOL_2D(); TfLiteRegistration* Register_MAX_POOL_2D(); TfLiteRegistration* Register_ABS(); +TfLiteRegistration* Register_PRELU(); AllOpsResolver::AllOpsResolver() { AddBuiltin(BuiltinOperator_DEPTHWISE_CONV_2D, Register_DEPTHWISE_CONV_2D()); @@ -34,6 +35,7 @@ AllOpsResolver::AllOpsResolver() { AddBuiltin(BuiltinOperator_CONV_2D, Register_CONV_2D()); AddBuiltin(BuiltinOperator_AVERAGE_POOL_2D, Register_AVERAGE_POOL_2D()); AddBuiltin(BuiltinOperator_ABS, Register_ABS()); + AddBuiltin(BuiltinOperator_PRELU, Register_PRELU()); } } // namespace micro diff --git a/tensorflow/lite/experimental/micro/kernels/prelu.cc b/tensorflow/lite/experimental/micro/kernels/prelu.cc new file mode 100644 index 00000000000..bfa5b9a0e75 --- /dev/null +++ b/tensorflow/lite/experimental/micro/kernels/prelu.cc @@ -0,0 +1,114 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/kernels/internal/reference/prelu.h" + +#include "tensorflow/lite/c/c_api_internal.h" +#include "tensorflow/lite/kernels/internal/quantization_util.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" +#include "tensorflow/lite/kernels/kernel_util.h" + +namespace tflite { +namespace ops { +namespace micro { +namespace activations { + +TfLiteStatus PreluPrepare(TfLiteContext* context, TfLiteNode* node) { + return kTfLiteOk; +} + +inline void BroadcastPrelu4DSlowFloat( + const RuntimeShape& unextended_input1_shape, const float* input1_data, + const RuntimeShape& unextended_input2_shape, const float* input2_data, + const RuntimeShape& unextended_output_shape, float* output_data) { + TFLITE_DCHECK_LE(unextended_input1_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_input2_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4); + const RuntimeShape output_shape = + RuntimeShape::ExtendedShape(4, unextended_output_shape); + + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(unextended_input1_shape, + unextended_input2_shape, &desc1, &desc2); + + for (int b = 0; b < output_shape.Dims(0); ++b) { + for (int y = 0; y < output_shape.Dims(1); ++y) { + for (int x = 0; x < output_shape.Dims(2); ++x) { + for (int c = 0; c < output_shape.Dims(3); ++c) { + auto out_idx = Offset(output_shape, b, y, x, c); + auto in1_idx = SubscriptToIndex(desc1, b, y, x, c); + auto in2_idx = SubscriptToIndex(desc2, b, y, x, c); + auto in1_val = input1_data[in1_idx]; + auto in2_val = input2_data[in2_idx]; + output_data[out_idx] = in1_val >= 0.0 ? 
in1_val : in1_val * in2_val; + } + } + } + } +} + +TfLiteStatus PreluEval(TfLiteContext* context, TfLiteNode* node) { + const TfLiteTensor* input = GetInput(context, node, 0); + const TfLiteTensor* alpha = GetInput(context, node, 1); + TfLiteTensor* output = GetOutput(context, node, 0); + int32_t output_multiplier = 0; + int output_shift = 0; + if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt16) { + double real_multiplier = + input->params.scale * alpha->params.scale / output->params.scale; + QuantizeMultiplierSmallerThanOneExp(real_multiplier, &output_multiplier, + &output_shift); + } + switch (input->type) { + case kTfLiteFloat32: { + BroadcastPrelu4DSlowFloat( + GetTensorShape(input), GetTensorData(input), + GetTensorShape(alpha), GetTensorData(alpha), + GetTensorShape(output), GetTensorData(output)); + return kTfLiteOk; + } break; + case kTfLiteUInt8: { + PreluParams op_params; + op_params.input_offset = -input->params.zero_point; + op_params.alpha_offset = -alpha->params.zero_point; + op_params.output_offset = output->params.zero_point; + op_params.output_multiplier = output_multiplier; + op_params.output_shift = output_shift; + reference_ops::BroadcastPrelu4DSlow( + op_params, GetTensorShape(input), GetTensorData(input), + GetTensorShape(alpha), GetTensorData(alpha), + GetTensorShape(output), GetTensorData(output)); + return kTfLiteOk; + } break; + default: + context->ReportError( + context, "Only float32 and uint8 are supported currently, got %d.", + TfLiteTypeGetName(input->type)); + return kTfLiteError; + } +} + +} // namespace activations + +TfLiteRegistration* Register_PRELU() { + static TfLiteRegistration r = {nullptr, nullptr, activations::PreluPrepare, + activations::PreluEval}; + return &r; +} + +} // namespace micro +} // namespace ops +} // namespace tflite diff --git a/tensorflow/lite/experimental/micro/kernels/prelu_test.cc b/tensorflow/lite/experimental/micro/kernels/prelu_test.cc new file mode 100644 index 00000000000..583b43ba189 --- /dev/null +++ b/tensorflow/lite/experimental/micro/kernels/prelu_test.cc @@ -0,0 +1,204 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/c/c_api_internal.h" +#include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h" +#include "tensorflow/lite/experimental/micro/simple_tensor_allocator.h" +#include "tensorflow/lite/experimental/micro/testing/micro_test.h" +#include "tensorflow/lite/experimental/micro/testing/test_utils.h" + +namespace tflite { +namespace testing { +namespace { + +void TestPreluFloat(std::initializer_list input_dims_data, + std::initializer_list input_data, + std::initializer_list alpha_dims_data, + std::initializer_list alpha_data, + std::initializer_list expected_output_data, + std::initializer_list output_dims_data, + float* output_data) { + TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data); + TfLiteIntArray* alpha_dims = IntArrayFromInitializer(alpha_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data); + const int output_dims_count = ElementCount(*output_dims); + constexpr int inputs_size = 2; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + CreateFloatTensor(input_data, input_dims, "input_tensor"), + CreateFloatTensor(alpha_data, alpha_dims, "alpha_tensor"), + CreateFloatTensor(output_data, output_dims, "output_tensor"), + }; + TfLiteContext context; + PopulateContext(tensors, tensors_size, &context); + ::tflite::ops::micro::AllOpsResolver resolver; + const TfLiteRegistration* registration = + resolver.FindOp(tflite::BuiltinOperator_PRELU, 1); + TF_LITE_MICRO_EXPECT_NE(nullptr, registration); + + size_t init_data_size = 0; + void* user_data = nullptr; + if (registration->init) { + user_data = registration->init(&context, nullptr, init_data_size); + } + int inputs_array_data[] = {2, 0, 1}; + TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); + int outputs_array_data[] = {1, 2}; + TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); + TfLiteIntArray* temporaries_array = IntArrayFromInitializer({0}); + TfLiteNode node; + node.inputs = inputs_array; + node.outputs = outputs_array; + node.temporaries = temporaries_array; + node.user_data = user_data; + node.builtin_data = nullptr; + node.custom_initial_data = nullptr; + node.custom_initial_data_size = 0; + node.delegate = nullptr; + if (registration->prepare) { + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); + } + TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node)); + if (registration->free) { + registration->free(&context, user_data); + } + for (int i = 0; i < output_dims_count; ++i) { + TF_LITE_MICRO_EXPECT_NEAR(expected_output_data.begin()[i], output_data[i], + 1e-5f); + } +} + +void TestPreluQuantized(std::initializer_list input_dims_data, + std::initializer_list input_data, + float input_min, float input_max, + std::initializer_list alpha_dims_data, + std::initializer_list alpha_data, + float alpha_min, float alpha_max, + std::initializer_list expected_output_data, + std::initializer_list output_dims_data, + float output_min, float output_max, + uint8_t* output_data) { + TfLiteIntArray* input_dims = IntArrayFromInitializer(input_dims_data); + TfLiteIntArray* alpha_dims = IntArrayFromInitializer(alpha_dims_data); + TfLiteIntArray* output_dims = IntArrayFromInitializer(output_dims_data); + const int 
output_dims_count = ElementCount(*output_dims); + constexpr int inputs_size = 2; + constexpr int outputs_size = 1; + constexpr int tensors_size = inputs_size + outputs_size; + TfLiteTensor tensors[tensors_size] = { + CreateQuantizedTensor(input_data, input_dims, "input_tensor", input_min, + input_max), + CreateQuantizedTensor(alpha_data, alpha_dims, "alpha_tensor", alpha_min, + alpha_max), + CreateQuantizedTensor(output_data, output_dims, "output_tensor", + output_min, output_max), + }; + TfLiteContext context; + PopulateContext(tensors, tensors_size, &context); + ::tflite::ops::micro::AllOpsResolver resolver; + const TfLiteRegistration* registration = + resolver.FindOp(tflite::BuiltinOperator_PRELU, 1); + TF_LITE_MICRO_EXPECT_NE(nullptr, registration); + + size_t init_data_size = 0; + void* user_data = nullptr; + if (registration->init) { + user_data = registration->init(&context, nullptr, init_data_size); + } + int inputs_array_data[] = {2, 0, 1}; + TfLiteIntArray* inputs_array = IntArrayFromInts(inputs_array_data); + int outputs_array_data[] = {1, 2}; + TfLiteIntArray* outputs_array = IntArrayFromInts(outputs_array_data); + TfLiteIntArray* temporaries_array = IntArrayFromInitializer({0}); + TfLiteNode node; + node.inputs = inputs_array; + node.outputs = outputs_array; + node.temporaries = temporaries_array; + node.user_data = user_data; + node.builtin_data = nullptr; + node.custom_initial_data = nullptr; + node.custom_initial_data_size = 0; + node.delegate = nullptr; + if (registration->prepare) { + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->prepare(&context, &node)); + } + TF_LITE_MICRO_EXPECT_NE(nullptr, registration->invoke); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, registration->invoke(&context, &node)); + if (registration->free) { + registration->free(&context, user_data); + } + for (int i = 0; i < output_dims_count; ++i) { + TF_LITE_MICRO_EXPECT_EQ(expected_output_data.begin()[i], output_data[i]); + } +} +} // namespace +} // namespace testing +} // namespace tflite + +TF_LITE_MICRO_TESTS_BEGIN + +TF_LITE_MICRO_TEST(FloatPreluActivationsOpTest) { + const int output_dims_count = 12; + float output_data[output_dims_count]; + tflite::testing::TestPreluFloat({1, 2, 2, 3}, // input shape + { + 0.0f, 0.0f, 0.0f, // Row 1, Column 1 + 1.0f, 1.0f, 1.0f, // Row 1, Column 2 + -1.0f, -1.0f, -1.0f, // Row 2, Column 1 + -2.0f, -2.0f, -2.0f, // Row 1, Column 2 + }, + {1, 1, 3}, // alpha shape + {0.0f, 1.0f, 2.0f}, // alpha values + { + 0.0f, 0.0f, 0.0f, // Row 1, Column 1 + 1.0f, 1.0f, 1.0f, // Row 1, Column 2 + 0.0f, -1.0f, -2.0f, // Row 2, Column 1 + 0.0f, -2.0f, -4.0f, // Row 1, Column 2 + }, + {1, 2, 2, 3}, // output shape + output_data); +} + +TF_LITE_MICRO_TEST(QuantizedPreluActivationsOpTest) { + using tflite::testing::F2Q; + const float kMin = -1; + const float kMax = 127.f / 128.f; + const float kAlphaMin = -0.5f; + const float kAlphaMax = 0.5f; + const int output_dims_count = 12; + uint8_t output_data[output_dims_count]; + tflite::testing::TestPreluQuantized( + {1, 2, 2, 3}, // input shape + {F2Q(0.0f, kMin, kMax), F2Q(0.0f, kMin, kMax), F2Q(0.0f, kMin, kMax), + F2Q(0.5f, kMin, kMax), F2Q(0.5f, kMin, kMax), F2Q(0.5f, kMin, kMax), + F2Q(-1.0f, kMin, kMax), F2Q(-1.0f, kMin, kMax), F2Q(-1.0f, kMin, kMax), + F2Q(-0.25f, kMin, kMax), F2Q(-0.25f, kMin, kMax), + F2Q(-0.25f, kMin, kMax)}, + kMin, kMax, {1, 1, 3}, // alpha shape + {F2Q(0.0f, kMin, kMax), F2Q(0.5f, kMin, kMax), F2Q(-0.5f, kMin, kMax)}, + kMin, kMax, + {F2Q(0.0f, kMin, kMax), F2Q(0.0f, kMin, kMax), F2Q(0.0f, kMin, kMax), + 
F2Q(0.5f, kMin, kMax), F2Q(0.5f, kMin, kMax), F2Q(0.5f, kMin, kMax), + F2Q(0.0f, kMin, kMax), F2Q(-0.5f, kMin, kMax), F2Q(0.5f, kMin, kMax), + F2Q(0.0f, kMin, kMax), F2Q(-0.125f, kMin, kMax), + F2Q(0.125f, kMin, kMax)}, + {1, 2, 2, 3}, // output shape + kMin, kMax, output_data); +} + +TF_LITE_MICRO_TESTS_END diff --git a/tensorflow/lite/experimental/micro/tools/make/Makefile b/tensorflow/lite/experimental/micro/tools/make/Makefile index 67a3ea97db6..f3828928612 100644 --- a/tensorflow/lite/experimental/micro/tools/make/Makefile +++ b/tensorflow/lite/experimental/micro/tools/make/Makefile @@ -112,6 +112,7 @@ tensorflow/lite/kernels/internal/reference/depthwiseconv_float.h \ tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h \ tensorflow/lite/kernels/internal/reference/fully_connected.h \ tensorflow/lite/kernels/internal/reference/pooling.h \ +tensorflow/lite/kernels/internal/reference/prelu.h \ tensorflow/lite/kernels/internal/reference/softmax.h \ tensorflow/lite/kernels/internal/round.h \ tensorflow/lite/kernels/internal/tensor_ctypes.h \ diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD index a4cbd0f3271..199909ccbf8 100644 --- a/tensorflow/lite/kernels/internal/BUILD +++ b/tensorflow/lite/kernels/internal/BUILD @@ -365,6 +365,7 @@ cc_library( "reference/integer_ops/softmax.h", "reference/integer_ops/tanh.h", "reference/pooling.h", + "reference/prelu.h", "reference/reference_ops.h", "reference/softmax.h", "reference/strided_slice.h", @@ -405,6 +406,7 @@ cc_library( "reference/fully_connected.h", "reference/legacy_reference_ops.h", "reference/pooling.h", + "reference/prelu.h", "reference/reference_ops.h", "reference/softmax.h", "reference/strided_slice.h", diff --git a/tensorflow/lite/kernels/internal/reference/prelu.h b/tensorflow/lite/kernels/internal/reference/prelu.h new file mode 100644 index 00000000000..adbbf66eb1b --- /dev/null +++ b/tensorflow/lite/kernels/internal/reference/prelu.h @@ -0,0 +1,77 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PRELU_H_ +#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PRELU_H_ + +#include "tensorflow/lite/kernels/internal/common.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/kernels/internal/types.h" + +namespace tflite { + +namespace reference_ops { + +// Broadcast prelu to output_shape for quantized uint8 data. 
+inline void BroadcastPrelu4DSlow(const PreluParams& params, + const RuntimeShape& input_shape, + const uint8* input_data, + const RuntimeShape& alpha_shape, + const uint8* alpha_data, + const RuntimeShape& output_shape, + uint8* output_data) { + TFLITE_DCHECK_LE(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(alpha_shape.DimensionsCount(), 4); + TFLITE_DCHECK_LE(output_shape.DimensionsCount(), 4); + const RuntimeShape extended_output_shape = + RuntimeShape::ExtendedShape(4, output_shape); + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(input_shape, alpha_shape, &desc1, &desc2); + + for (int b = 0; b < extended_output_shape.Dims(0); ++b) { + for (int y = 0; y < extended_output_shape.Dims(1); ++y) { + for (int x = 0; x < extended_output_shape.Dims(2); ++x) { + for (int c = 0; c < extended_output_shape.Dims(3); ++c) { + int output_index = Offset(extended_output_shape, b, y, x, c); + int input_index = SubscriptToIndex(desc1, b, y, x, c); + const int32 input_value = + params.input_offset + input_data[input_index]; + if (input_value >= 0) { + output_data[output_index] = input_data[input_index]; + } else { + auto alpha_index = SubscriptToIndex(desc2, b, y, x, c); + const int32 alpha_value = + params.alpha_offset + alpha_data[alpha_index]; + const int32 unclamped_output = + params.output_offset + + MultiplyByQuantizedMultiplierSmallerThanOneExp( + input_value * alpha_value, params.output_multiplier, + params.output_shift); + const int32 quantized_min = std::numeric_limits::min(); + const int32 quantized_max = std::numeric_limits::max(); + const int32 clamped_output = std::min( + quantized_max, std::max(quantized_min, unclamped_output)); + output_data[output_index] = static_cast(clamped_output); + } + } + } + } + } +} + +} // namespace reference_ops +} // namespace tflite + +#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_PRELU_H_ diff --git a/tensorflow/lite/kernels/internal/reference/reference_ops.h b/tensorflow/lite/kernels/internal/reference/reference_ops.h index a8b35ae7b92..92b3b47fb04 100644 --- a/tensorflow/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/lite/kernels/internal/reference/reference_ops.h @@ -35,6 +35,7 @@ limitations under the License. 
#include "tensorflow/lite/kernels/internal/reference/conv.h" #include "tensorflow/lite/kernels/internal/reference/fully_connected.h" #include "tensorflow/lite/kernels/internal/reference/pooling.h" +#include "tensorflow/lite/kernels/internal/reference/prelu.h" #include "tensorflow/lite/kernels/internal/reference/softmax.h" #include "tensorflow/lite/kernels/internal/reference/strided_slice.h" #include "tensorflow/lite/kernels/internal/round.h" @@ -4403,53 +4404,6 @@ inline void ResizeNearestNeighbor( } } -inline void BroadcastPrelu4DSlow(const PreluParams& params, - const RuntimeShape& input_shape, - const uint8* input_data, - const RuntimeShape& alpha_shape, - const uint8* alpha_data, - const RuntimeShape& output_shape, - uint8* output_data) { - TFLITE_DCHECK_LE(input_shape.DimensionsCount(), 4); - TFLITE_DCHECK_LE(alpha_shape.DimensionsCount(), 4); - TFLITE_DCHECK_LE(output_shape.DimensionsCount(), 4); - const RuntimeShape extended_output_shape = - RuntimeShape::ExtendedShape(4, output_shape); - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input_shape, alpha_shape, &desc1, &desc2); - - for (int b = 0; b < extended_output_shape.Dims(0); ++b) { - for (int y = 0; y < extended_output_shape.Dims(1); ++y) { - for (int x = 0; x < extended_output_shape.Dims(2); ++x) { - for (int c = 0; c < extended_output_shape.Dims(3); ++c) { - int output_index = Offset(extended_output_shape, b, y, x, c); - int input_index = SubscriptToIndex(desc1, b, y, x, c); - const int32 input_value = - params.input_offset + input_data[input_index]; - if (input_value >= 0) { - output_data[output_index] = input_data[input_index]; - } else { - auto alpha_index = SubscriptToIndex(desc2, b, y, x, c); - const int32 alpha_value = - params.alpha_offset + alpha_data[alpha_index]; - const int32 unclamped_output = - params.output_offset + - MultiplyByQuantizedMultiplierSmallerThanOneExp( - input_value * alpha_value, params.output_multiplier, - params.output_shift); - const int32 quantized_min = std::numeric_limits::min(); - const int32 quantized_max = std::numeric_limits::max(); - const int32 clamped_output = std::min( - quantized_max, std::max(quantized_min, unclamped_output)); - output_data[output_index] = static_cast(clamped_output); - } - } - } - } - } -} - template void Fill(const RuntimeShape& value_shape, const T* value_data, const RuntimeShape& output_shape, T* output_data) { From 698a70a9eb55cb9444cd25070d6cd2ccab0db44c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 23 Jul 2019 02:02:24 -0700 Subject: [PATCH 0377/3053] Update GraphDef version to 105. PiperOrigin-RevId: 259494517 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index a01653124b2..94d81942cb8 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 104 // Updated: 2019/7/22 +#define TF_GRAPH_DEF_VERSION 105 // Updated: 2019/7/23 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From 2522738eba2be57e524b48ad40c15f88f5464b3f Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Tue, 23 Jul 2019 02:02:24 -0700 Subject: [PATCH 0378/3053] compat: Update forward compatibility horizon to 2019-07-23 PiperOrigin-RevId: 259494518 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index bb236f1142e..493f7266b20 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -31,7 +31,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2019, 7, 22) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2019, 7, 23) _FORWARD_COMPATIBILITY_HORIZON_OVERRIDDEN = False _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" From fd635616f65b492e9c441be5dca6427b1531955a Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 23 Jul 2019 06:17:54 -0700 Subject: [PATCH 0379/3053] [XLA:Python] Make PythonRefManager static global to remove all Python deps from local_client. PiperOrigin-RevId: 259523231 --- tensorflow/compiler/xla/python/BUILD | 36 ++++++++++--------- .../compiler/xla/python/local_client.cc | 1 - tensorflow/compiler/xla/python/local_client.h | 13 ------- .../compiler/xla/python/python_ref_manager.cc | 5 +++ .../compiler/xla/python/python_ref_manager.h | 5 +++ tensorflow/compiler/xla/python/xla.cc | 15 ++++---- 6 files changed, 38 insertions(+), 37 deletions(-) diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD index fbcaa6f9fc3..b2877670223 100644 --- a/tensorflow/compiler/xla/python/BUILD +++ b/tensorflow/compiler/xla/python/BUILD @@ -188,24 +188,11 @@ cc_library( cc_library( name = "local_client", - srcs = [ - "local_client.cc", - "python_ref_manager.cc", - "python_ref_manager.h", - ], - hdrs = [ - "local_client.h", - ], - copts = [ - "-fexceptions", - "-fno-strict-aliasing", - "-Wno-c++98-c++11-compat", - ], - features = ["-use_header_modules"], + srcs = ["local_client.cc"], + hdrs = ["local_client.h"], deps = [ ":device", ":shared_device_buffer", - ":types", "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", @@ -227,12 +214,28 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core/profiler/lib:traceme", "//tensorflow/stream_executor:tf_allocator_adapter", - "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/synchronization", "@com_google_absl//absl/time", "@com_google_absl//absl/types:span", + ], +) + +cc_library( + name = "python_ref_manager", + srcs = ["python_ref_manager.cc"], + hdrs = ["python_ref_manager.h"], + copts = [ + "-fexceptions", + "-fno-strict-aliasing", + "-Wno-c++98-c++11-compat", + ], + features = ["-use_header_modules"], + deps = [ + "@com_google_absl//absl/container:inlined_vector", + "@com_google_absl//absl/synchronization", + "@com_google_absl//absl/types:span", "@pybind11", ], ) @@ -252,6 +255,7 @@ tf_pybind_extension( deps = [ ":local_client", ":shared_device_buffer", + ":python_ref_manager", ":types", ":xrt", "@com_google_absl//absl/base", diff --git a/tensorflow/compiler/xla/python/local_client.cc 
b/tensorflow/compiler/xla/python/local_client.cc index b6d44ef011e..e985d6ff5c6 100644 --- a/tensorflow/compiler/xla/python/local_client.cc +++ b/tensorflow/compiler/xla/python/local_client.cc @@ -91,7 +91,6 @@ limitations under the License. #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/python/shared_device_buffer.h" -#include "tensorflow/compiler/xla/python/types.h" #include "tensorflow/compiler/xla/service/platform_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/util.h" diff --git a/tensorflow/compiler/xla/python/local_client.h b/tensorflow/compiler/xla/python/local_client.h index 65e3203a258..7496d5352d4 100644 --- a/tensorflow/compiler/xla/python/local_client.h +++ b/tensorflow/compiler/xla/python/local_client.h @@ -27,7 +27,6 @@ limitations under the License. #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/python/device.h" -#include "tensorflow/compiler/xla/python/python_ref_manager.h" #include "tensorflow/compiler/xla/python/shared_device_buffer.h" #include "tensorflow/compiler/xla/service/computation_placer.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" @@ -93,18 +92,10 @@ class PyLocalClient { return &h2d_transfer_pool_; } - PythonRefManager& py_ref_manager() { return py_ref_manager_; } - protected: std::string platform_name_; LocalClient* client_; - // py_ref_manager_ must come after devices_ in the class destruction order - // (i.e., appear first in the class.) - // Destruction of devices waits for them to quiesce; callbacks on device - // streams may refer to py_ref_manager_ and we must wait for them to complete. - PythonRefManager py_ref_manager_; - std::vector> devices_; se::DeviceMemoryAllocator* allocator_; std::unique_ptr owned_allocator_; @@ -148,10 +139,6 @@ class PyLocalBuffer { const Shape& on_host_shape() const { return on_host_shape_; } int device_ordinal() const { return device_ordinal_; } - // TODO(makro): Make `client` private once `PythonRefManager` is refactored - // out of `PyLocalClient`. - PyLocalClient* client() const { return client_.get(); } - // Returns the buffer's value as a tuple DAG of Python arrays. If the value // has previously been prefetched to the host, then returns the prefetched // version, otherwise copies the buffer to the host. Blocks until the diff --git a/tensorflow/compiler/xla/python/python_ref_manager.cc b/tensorflow/compiler/xla/python/python_ref_manager.cc index 1e9cc58d090..0a980f1a749 100644 --- a/tensorflow/compiler/xla/python/python_ref_manager.cc +++ b/tensorflow/compiler/xla/python/python_ref_manager.cc @@ -49,4 +49,9 @@ void PythonRefManager::CollectGarbage() { python_garbage_.clear(); } +PythonRefManager* GlobalPyRefManager() { + static PythonRefManager* static_ref_manager = new PythonRefManager(); + return static_ref_manager; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/python/python_ref_manager.h b/tensorflow/compiler/xla/python/python_ref_manager.h index 8be19336a89..054150faf25 100644 --- a/tensorflow/compiler/xla/python/python_ref_manager.h +++ b/tensorflow/compiler/xla/python/python_ref_manager.h @@ -74,6 +74,11 @@ class PythonRefManager { std::deque python_garbage_ GUARDED_BY(mu_); }; +// A global PythonRefManager. Unless `CollectGarbage()` is called before +// shutdown, this container will hold on to Python objects and thus cause a +// leak. 
This behavior is similar to `tensorflow::ClearDecRefCache()`. +PythonRefManager* GlobalPyRefManager(); + } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_PYTHON_PYTHON_REF_MANAGER_H_ diff --git a/tensorflow/compiler/xla/python/xla.cc b/tensorflow/compiler/xla/python/xla.cc index 6cd56b800a2..d8a4aaa4650 100644 --- a/tensorflow/compiler/xla/python/xla.cc +++ b/tensorflow/compiler/xla/python/xla.cc @@ -35,6 +35,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/python/local_client.h" +#include "tensorflow/compiler/xla/python/python_ref_manager.h" #include "tensorflow/compiler/xla/python/types.h" #include "tensorflow/compiler/xla/python/xrt.h" #include "tensorflow/compiler/xla/service/custom_call_target_registry.h" @@ -315,14 +316,14 @@ PYBIND11_MODULE(xla_extension, m) { .def("TransferToInfeed", [](PyLocalClient* client, const LiteralSlice& literal, int device_ordinal) { - client->py_ref_manager().CollectGarbage(); + GlobalPyRefManager()->CollectGarbage(); py::gil_scoped_release gil_release; return client->TransferToInfeed(literal, device_ordinal); }) .def("TransferFromOutfeed", [](PyLocalClient* client, const Shape& shape, int device_ordinal) -> StatusOr { - client->py_ref_manager().CollectGarbage(); + GlobalPyRefManager()->CollectGarbage(); std::shared_ptr literal_shared; { py::gil_scoped_release gil_release; @@ -339,11 +340,11 @@ PYBIND11_MODULE(xla_extension, m) { [](const pybind11::object& argument, std::shared_ptr client, int device_ordinal) -> StatusOr> { - client->py_ref_manager().CollectGarbage(); + GlobalPyRefManager()->CollectGarbage(); TF_ASSIGN_OR_RETURN(PythonBufferTree tree, GetPythonBufferTree(argument)); std::shared_ptr py_buffer_ref = - client->py_ref_manager().ManageReferences( + GlobalPyRefManager()->ManageReferences( absl::MakeSpan(tree.arrays)); tree.arrays.clear(); @@ -360,7 +361,7 @@ PYBIND11_MODULE(xla_extension, m) { .def_static("make_tuple", &PyLocalBuffer::MakeTuple) .def("copy_to_device", [](PyLocalBuffer* buffer, int dst_device_ordinal) { - buffer->client()->py_ref_manager().CollectGarbage(); + GlobalPyRefManager()->CollectGarbage(); py::gil_scoped_release gil_release; return buffer->CopyToDevice(dst_device_ordinal); }) @@ -368,14 +369,14 @@ PYBIND11_MODULE(xla_extension, m) { .def("destructure", &PyLocalBuffer::DestructureTuple) .def("block_host_until_ready", [](PyLocalBuffer* buffer) { - buffer->client()->py_ref_manager().CollectGarbage(); + GlobalPyRefManager()->CollectGarbage(); py::gil_scoped_release gil_release; return buffer->BlockHostUntilReady(); }) .def("copy_to_host_async", &PyLocalBuffer::CopyToHostAsync) .def("to_py", [](PyLocalBuffer* buffer) -> StatusOr { - buffer->client()->py_ref_manager().CollectGarbage(); + GlobalPyRefManager()->CollectGarbage(); std::shared_ptr literal; { py::gil_scoped_release gil_release; From 3a7b36bca7f43ce4f0d0791ce0e0d84ece8683d9 Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Tue, 23 Jul 2019 08:07:47 -0700 Subject: [PATCH 0380/3053] [Grappler] Remove DCHECK from a MutableGraphView CanDedupControlWithRegularInput check. 
PiperOrigin-RevId: 259537618 --- tensorflow/core/grappler/mutable_graph_view.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/grappler/mutable_graph_view.cc b/tensorflow/core/grappler/mutable_graph_view.cc index 1200cff7127..6b6cc8d49da 100644 --- a/tensorflow/core/grappler/mutable_graph_view.cc +++ b/tensorflow/core/grappler/mutable_graph_view.cc @@ -89,8 +89,9 @@ bool CanDedupControlWithRegularInput(const MutableGraphView& graph, bool CanDedupControlWithRegularInput(const MutableGraphView& graph, absl::string_view control_node_name) { NodeDef* control_node = graph.GetNode(control_node_name); - DCHECK(control_node != nullptr) - << "Didn't find a node for control dependency: " << control_node_name; + if (control_node == nullptr) { + return false; + } return CanDedupControlWithRegularInput(graph, *control_node); } From 12846d752c1474201cef985639f78e56e2081da6 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 23 Jul 2019 08:17:42 -0700 Subject: [PATCH 0381/3053] Fixed input dtype for `preprocessing_normalization_test.test_layer_computation`. PiperOrigin-RevId: 259538951 --- .../layers/preprocessing/normalization.py | 1 + .../preprocessing/normalization_test.py | 38 +++++++++++-------- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/tensorflow/python/keras/layers/preprocessing/normalization.py b/tensorflow/python/keras/layers/preprocessing/normalization.py index 657a0f9ad51..1dc109c36ab 100644 --- a/tensorflow/python/keras/layers/preprocessing/normalization.py +++ b/tensorflow/python/keras/layers/preprocessing/normalization.py @@ -83,6 +83,7 @@ class Normalization(CombinerPreprocessingLayer): # count is not used in this class's call() method, but is used to re-create # the accumulator during multiple calls to 'adapt'. + # TODO(omalleyt): should mean and variance be set to self.dtype? 
self.mean = self._add_state_variable( name=_MEAN_NAME, shape=mean_and_var_shape, diff --git a/tensorflow/python/keras/layers/preprocessing/normalization_test.py b/tensorflow/python/keras/layers/preprocessing/normalization_test.py index aff307cf6da..7167c43439f 100644 --- a/tensorflow/python/keras/layers/preprocessing/normalization_test.py +++ b/tensorflow/python/keras/layers/preprocessing/normalization_test.py @@ -129,36 +129,39 @@ class NormalizationTest(keras_parameterized.TestCase, @parameterized.named_parameters( { - "adapt_data": np.array([[1], [2], [3], [4], [5]]), + "adapt_data": np.array([[1.], [2.], [3.], [4.], [5.]]), "axis": -1, - "test_data": np.array([[1], [2], [3]]), + "test_data": np.array([[1.], [2.], [3.]]), "expected": np.array([[-1], [-.5], [0]]), "testcase_name": "2d_single_element" }, { "adapt_data": - np.array([[[1, 2, 3], [2, 3, 4]], [[3, 4, 5], [4, 5, 6]]]), + np.array([[[1., 2., 3.], [2., 3., 4.]], + [[3., 4., 5.], [4., 5., 6.]]]), "axis": 1, "test_data": - np.array([[[1, 2, 3], [2, 3, 4]], [[3, 4, 5], [4, 5, 6]]]), + np.array([[[1., 2., 3.], [2., 3., 4.]], + [[3., 4., 5.], [4., 5., 6.]]]), "expected": np.array([[[-1.2, -0.6, 0.], [-1.2, -0.6, 0.]], [[0., 0.6, 1.2], [0., 0.6, 1.2]]]), - "testcase_name": - "3d_internal_axis" + "testcase_name": "3d_internal_axis" }, { "adapt_data": - np.array([[[1, 0, 3], [2, 3, 4]], [[3, -1, 5], [4, 5, 8]]]), + np.array([[[1., 0., 3.], [2., 3., 4.]], + [[3., -1., 5.], [4., 5., 8.]]]), "axis": (1, 2), "test_data": - np.array([[[3, 1, -1], [2, 5, 4]], [[3, 0, 5], [2, 5, 8]]]), + np.array([[[3., 1., -1.], [2., 5., 4.]], + [[3., 0., 5.], [2., 5., 8.]]]), "expected": np.array([[[1., 6., -5.], [-1., 1., -0.5]], [[1., 2., 1.], [-1., 1., 0.5]]]), - "testcase_name": - "3d_multiple_axis" + "testcase_name": "3d_multiple_axis" }) def test_layer_computation(self, adapt_data, axis, test_data, expected): + cls = get_layer_class() layer = cls(axis=axis) layer.adapt(adapt_data) @@ -167,13 +170,16 @@ class NormalizationTest(keras_parameterized.TestCase, input_data = keras.Input(shape=input_shape) output = layer(input_data) model = keras.Model(input_data, output) - + model._run_eagerly = testing_utils.should_run_eagerly() + model._run_distributed = testing_utils.should_run_distributed() output_data = model.predict(test_data) self.assertAllClose(expected, output_data) - # 'assign' doesn't work in V1 mode, so don't test it in V1. - @keras_parameterized.run_all_keras_modes(always_skip_v1=True) def test_mean_setting_continued_adapt_failure(self): + + if not context.executing_eagerly(): + self.skipTest("'assign' doesn't work in V1, so don't test in V1.") + cls = get_layer_class() layer = cls() layer.build((2,)) @@ -181,9 +187,11 @@ class NormalizationTest(keras_parameterized.TestCase, with self.assertRaisesRegex(RuntimeError, "without also setting 'count'"): layer.adapt(np.array([[1, 2]]), reset_state=False) - # 'assign' doesn't work in V1 mode, so don't test it in V1. - @keras_parameterized.run_all_keras_modes(always_skip_v1=True) def test_var_setting_continued_adapt_failure(self): + + if not context.executing_eagerly(): + self.skipTest("'assign' doesn't work in V1, so don't test in V1.") + cls = get_layer_class() layer = cls() layer.build((2,)) From c65d182598b975e4ae27b70b25d26f46a55015fa Mon Sep 17 00:00:00 2001 From: Paul Andrey Date: Tue, 23 Jul 2019 17:44:14 +0200 Subject: [PATCH 0382/3053] Added dtype compatibility tests. Updated the stack of tests on GaussianNoise and GaussianDropout layers. 
This both highlights issue #30834 and the validity of the suggested fix (PR #30844). --- tensorflow/python/keras/layers/noise_test.py | 72 +++++++++++++++----- 1 file changed, 56 insertions(+), 16 deletions(-) diff --git a/tensorflow/python/keras/layers/noise_test.py b/tensorflow/python/keras/layers/noise_test.py index f1537a6919f..b860ff9ae55 100644 --- a/tensorflow/python/keras/layers/noise_test.py +++ b/tensorflow/python/keras/layers/noise_test.py @@ -18,6 +18,9 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import numpy as np + +from tensorflow.python import dtypes from tensorflow.python import keras from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras import testing_utils @@ -27,24 +30,61 @@ from tensorflow.python.platform import test @keras_parameterized.run_all_keras_modes class NoiseLayersTest(keras_parameterized.TestCase): - def test_GaussianNoise(self): - testing_utils.layer_test( - keras.layers.GaussianNoise, - kwargs={'stddev': 1.}, - input_shape=(3, 2, 3)) + def test_GaussianNoise(self): + testing_utils.layer_test( + keras.layers.GaussianNoise, + kwargs={'stddev': 1.}, + input_shape=(3, 2, 3) + ) - def test_GaussianDropout(self): - testing_utils.layer_test( - keras.layers.GaussianDropout, - kwargs={'rate': 0.5}, - input_shape=(3, 2, 3)) + def test_GaussianDropout(self): + testing_utils.layer_test( + keras.layers.GaussianDropout, + kwargs={'rate': 0.5}, + input_shape=(3, 2, 3) + ) - def test_AlphaDropout(self): - testing_utils.layer_test( - keras.layers.AlphaDropout, - kwargs={'rate': 0.2}, - input_shape=(3, 2, 3)) + def test_AlphaDropout(self): + testing_utils.layer_test( + keras.layers.AlphaDropout, + kwargs={'rate': 0.2}, + input_shape=(3, 2, 3) + ) + + @staticmethod + def _make_model(dtype, gtype): + assert dtype in (dtypes.float32, dtypes.float64) + assert gtype in ('noise', 'dropout') + model = keras.Sequential() + model.add(keras.layers.Dense(8, input_shape=(32,), dtype=dtype)) + if gtype == 'noise': + gaussian = keras.layers.GaussianNoise(0.0003) + else: + gaussian = keras.layers.GaussianDropout(0.1) + model.add(gaussian) + return model + + def _train_model(self, dtype, gtype): + model = self._make_model(dtype, gtype) + model.compile( + optimizer='sgd', + loss='mse', + run_eagerly=testing_utils.should_run_eagerly() + ) + model.train_on_batch(np.zeros((8, 32)), np.zeros((8, 8))) + + def test_noise_float32(self): + self._train_model(dtypes.float32, 'noise') + + def test_noise_float64(self): + self._train_model(dtypes.float64, 'noise') + + def test_dropout_float32(self): + self._train_model(dtypes.float32, 'dropout') + + def test_dropout_float64(self): + self._train_model(dtypes.float64, 'dropout') if __name__ == '__main__': - test.main() + test.main() From 1fefe05424bb18184a2f896ec39a1a61cd0f454c Mon Sep 17 00:00:00 2001 From: James Ring Date: Tue, 23 Jul 2019 09:34:08 -0700 Subject: [PATCH 0383/3053] remove tensorflow-android deployment from java release script tensorflow-android is deprecated in favor of TF Lite and will not see any new releases. This change also adds the ability to deploy tensorflow artifacts to the local maven repository for testing. 
To use this: DEPLOY_OSSRH=false DEPLOY_BINTRAY=false DEPLOY_LOCAL=true ./release.sh PiperOrigin-RevId: 259552290 --- tensorflow/java/maven/release.sh | 1 + tensorflow/java/maven/run_inside_container.sh | 41 ++---- .../pom-android.xml.template | 27 ---- .../java/maven/tensorflow-android/update.py | 123 ------------------ 4 files changed, 14 insertions(+), 178 deletions(-) delete mode 100644 tensorflow/java/maven/tensorflow-android/pom-android.xml.template delete mode 100644 tensorflow/java/maven/tensorflow-android/update.py diff --git a/tensorflow/java/maven/release.sh b/tensorflow/java/maven/release.sh index 9012ea14ea6..269bbc916a0 100755 --- a/tensorflow/java/maven/release.sh +++ b/tensorflow/java/maven/release.sh @@ -51,6 +51,7 @@ docker run \ -e TF_VERSION="${TF_VERSION}" \ -e DEPLOY_OSSRH="${DEPLOY_OSSRH:-true}" \ -e DEPLOY_BINTRAY="${DEPLOY_BINTRAY:-true}" \ + -e DEPLOY_LOCAL="${DEPLOY_LOCAL:-false}" \ -v ${PWD}:/tensorflow \ -v "${SETTINGS_XML}":/root/.m2/settings.xml \ -v ${HOME}/.gnupg:/root/.gnupg \ diff --git a/tensorflow/java/maven/run_inside_container.sh b/tensorflow/java/maven/run_inside_container.sh index 75c6cff5298..27ae193900f 100644 --- a/tensorflow/java/maven/run_inside_container.sh +++ b/tensorflow/java/maven/run_inside_container.sh @@ -25,10 +25,11 @@ TF_ECOSYSTEM_URL="https://github.com/tensorflow/ecosystem.git" # environment variables can be set to skip either repository. DEPLOY_BINTRAY="${DEPLOY_BINTRAY:-true}" DEPLOY_OSSRH="${DEPLOY_OSSRH:-true}" +DEPLOY_LOCAL="${DEPLOY_LOCAL:-false}" PROTOC_RELEASE_URL="https://github.com/google/protobuf/releases/download/v3.5.1/protoc-3.5.1-linux-x86_64.zip" -if [[ "${DEPLOY_BINTRAY}" != "true" && "${DEPLOY_OSSRH}" != "true" ]]; then - echo "Must deploy to at least one of Bintray or OSSRH" >&2 +if [[ "${DEPLOY_BINTRAY}" != "true" && "${DEPLOY_OSSRH}" != "true" && "${DEPLOY_LOCAL}" != "true" ]]; then + echo "Must deploy to at least one of Bintray, OSSRH or local" >&2 exit 2 fi @@ -40,7 +41,7 @@ clean() { # artifacts lying around) mvn -q clean rm -rf libtensorflow_jni/src libtensorflow_jni/target libtensorflow_jni_gpu/src libtensorflow_jni_gpu/target \ - libtensorflow/src libtensorflow/target tensorflow-android/target proto/src proto/target \ + libtensorflow/src libtensorflow/target proto/src proto/target \ tensorflow-hadoop/src tensorflow-hadoop/target spark-tensorflow-connector/src spark-tensorflow-connector/target } @@ -71,17 +72,6 @@ download_libtensorflow() { cd "${DIR}" } -# Fetch the android aar artifact from the CI build system, and update -# its associated pom file. -update_tensorflow_android() { - TARGET_DIR="${DIR}/tensorflow-android/target" - mkdir -p "${TARGET_DIR}" - python "${DIR}/tensorflow-android/update.py" \ - --version "${TF_VERSION}" \ - --template "${DIR}/tensorflow-android/pom-android.xml.template" \ - --dir "${TARGET_DIR}" -} - download_libtensorflow_jni() { NATIVE_DIR="${DIR}/libtensorflow_jni/src/main/resources/org/tensorflow/native" mkdir -p "${NATIVE_DIR}" @@ -211,19 +201,11 @@ download_tf_ecosystem() { # n/a deploy_profile() { local profile="$1" - # Deploy the non-android pieces. - mvn deploy -P"${profile}" - # Determine the correct pom file property to use - # for the repository url. 
- local rtype - rtype='repository' - local url=$(mvn_property "${profile}" "project.distributionManagement.${rtype}.url") - local repositoryId=$(mvn_property "${profile}" "project.distributionManagement.${rtype}.id") - mvn gpg:sign-and-deploy-file \ - -Dfile="${DIR}/tensorflow-android/target/tensorflow.aar" \ - -DpomFile="${DIR}/tensorflow-android/target/pom-android.xml" \ - -Durl="${url}" \ - -DrepositoryId="${repositoryId}" + if [[ ${profile} == "local" ]]; then + mvn install + else + mvn deploy -P"${profile}" + fi } # If successfully built, try to deploy. @@ -232,6 +214,10 @@ deploy_profile() { # ./release.sh ${TF_VERSION} ${SETTINGS_XML} bash # To get a shell to poke around the maven artifacts with. deploy_artifacts() { + # Deploy artifacts to local maven repository if requested + if [[ "${DEPLOY_LOCAL}" == "true" ]]; then + deploy_profile 'local' + fi # Deploy artifacts to ossrh if requested. if [[ "${DEPLOY_OSSRH}" == "true" ]]; then deploy_profile 'ossrh' @@ -264,7 +250,6 @@ update_version_in_pom download_libtensorflow download_libtensorflow_jni download_libtensorflow_jni_gpu -update_tensorflow_android generate_java_protos download_tf_ecosystem diff --git a/tensorflow/java/maven/tensorflow-android/pom-android.xml.template b/tensorflow/java/maven/tensorflow-android/pom-android.xml.template deleted file mode 100644 index 37d2372d7b0..00000000000 --- a/tensorflow/java/maven/tensorflow-android/pom-android.xml.template +++ /dev/null @@ -1,27 +0,0 @@ - - 4.0.0 - - org.tensorflow - tensorflow-android - ${version} - aar - - TensorFlow AAR for Android Inference Library and Java API - https://github.com/tensorflow/tensorflow/ - - org.tensorflow - parentpom - ${version} - ../ - - - - UTF-8 - ${build_commit_id} - ${build_type} - - - diff --git a/tensorflow/java/maven/tensorflow-android/update.py b/tensorflow/java/maven/tensorflow-android/update.py deleted file mode 100644 index c620564072c..00000000000 --- a/tensorflow/java/maven/tensorflow-android/update.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Fetch android artifacts and update pom properties.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import argparse -import json -import string -import sys -import urllib2 - - -def get_args(): - """Parse command line args.""" - parser = argparse.ArgumentParser() - parser.add_argument( - '--version', required=True, help='Version for the artifact.') - parser.add_argument( - '--dir', - required=True, - help='Directory where the pom and aar artifact will be written.') - parser.add_argument( - '--template', required=True, help='Path to pom template file.') - return parser.parse_args() - - -def get_json(url): - """Load the contents of the URL as a json object.""" - return json.load(urllib2.urlopen(url)) - - -def get_commit_id(build_info): - """Fetch the git commit id from the build info json object.""" - release_commit_id = build_info.get('build_commit_id') - if release_commit_id: - return release_commit_id - actions = build_info.get('actions') - build_data = next( - a for a in actions - if a.get('_class') == 'hudson.plugins.git.util.BuildData') - if not build_data: - raise ValueError('Missing BuildData: %s' % build_info) - revision_info = build_data.get('lastBuiltRevision') - if not revision_info: - raise ValueError('Missing lastBuiltRevision: %s' % build_info) - return revision_info.get('SHA1') - - -def get_aar_url(build_info): - """Given the json build info, find the URL to the tensorflow.aar artifact.""" - base_url = build_info.get('url') - if not base_url: - raise ValueError('Missing url: %s' % build_info) - build_class = build_info.get('_class') - if (build_class == 'hudson.model.FreeStyleBuild' or - build_class == 'hudson.matrix.MatrixRun'): - aar_info = next( - a for a in build_info.get('artifacts') - if a.get('fileName') == 'tensorflow.aar') - if not aar_info: - raise ValueError('Missing aar artifact: %s' % build_info) - return '%s/artifact/%s' % (base_url, aar_info.get('relativePath')) - - raise ValueError('Unknown build_type %s' % build_info) - - -def read_template(path): - with open(path) as f: - return string.Template(f.read()) - - -def main(): - args = get_args() - - release_prefix = 'https://storage.googleapis.com/tensorflow/libtensorflow' - info_url = '%s/android_buildinfo-%s.json' % (release_prefix, args.version) - aar_url = '%s/tensorflow-%s.aar' % (release_prefix, args.version) - build_type = 'release-android' - - # Retrieve build information - build_info = get_json(info_url) - - # Check all required build info is present - build_commit_id = get_commit_id(build_info) - if not build_commit_id: - raise ValueError('Missing commit id: %s' % build_info) - - # Write the pom file updated with build attributes. - template = read_template(args.template) - with open('%s/pom-android.xml' % args.dir, 'w') as f: - f.write( - template.substitute({ - 'build_commit_id': build_commit_id, - 'build_type': build_type, - 'version': args.version - })) - - # Retrieve the aar location if needed. - if not aar_url: - aar_url = get_aar_url(build_info) - - # And download the aar to the desired location. - with open('%s/tensorflow.aar' % args.dir, 'w') as f: - aar = urllib2.urlopen(aar_url) - f.write(aar.read()) - - -if __name__ == '__main__': - sys.exit(main()) From c015d55b6dfd7b1cc4296d54c00d95a56d5599ed Mon Sep 17 00:00:00 2001 From: Shining Sun Date: Tue, 23 Jul 2019 09:34:59 -0700 Subject: [PATCH 0384/3053] Move two tests from contrib to core. 
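As a reading aid for the diffs that follow: the moved tests drop the use_core_strategy test axis, so the contrib copies keep only the construct-then-configure() setup, while the new tensorflow/python/distribute/collective_all_reduce_strategy_test.py builds its strategy from a cluster resolver. Below is a minimal sketch of the two construction paths, condensed from the create_test_objects helpers in both copies; the two-worker cluster spec is invented for illustration, and MockCollectiveAllReduceStrategy is the wrapper class defined in the new test file, so that final step is left as a comment.

    # Sketch only, assuming a 2019-era TF 1.x build where tensorflow.contrib is
    # still available; condensed from the create_test_objects helpers below.
    from tensorflow.core.protobuf import config_pb2
    from tensorflow.python.distribute import multi_worker_util
    from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
    from tensorflow.contrib.distribute.python import collective_all_reduce_strategy

    # Invented two-worker cluster, purely for illustration.
    cluster_spec = {'worker': ['localhost:12345', 'localhost:23456']}

    # Contrib path (kept under tensorflow/contrib/distribute/python): construct
    # the strategy directly, then configure() it for this task.
    contrib_strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
        num_gpus_per_worker=0)
    contrib_strategy.configure(
        session_config=config_pb2.ConfigProto(),
        cluster_spec=cluster_spec,
        task_type='worker',
        task_id=0)

    # Core path (used by the new test file): describe the cluster with a
    # resolver and hand it to the strategy.
    resolver = SimpleClusterResolver(
        cluster_spec=multi_worker_util.normalize_cluster_spec(cluster_spec),
        task_type='worker',
        task_id=0,
        num_accelerators={'GPU': 0})
    # strategy = MockCollectiveAllReduceStrategy(resolver), where the wrapper
    # simply forwards the resolver to CollectiveAllReduceExtended (see the new
    # test file added below).

The rest of the change is this parameter removal in the contrib copies plus the new test files and their BUILD targets.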
PiperOrigin-RevId: 259552437 --- .../collective_all_reduce_strategy_test.py | 249 ++---- .../python/parameter_server_strategy_test.py | 268 ++---- tensorflow/python/distribute/BUILD | 66 ++ .../collective_all_reduce_strategy_test.py | 592 +++++++++++++ .../parameter_server_strategy_test.py | 817 ++++++++++++++++++ 5 files changed, 1631 insertions(+), 361 deletions(-) create mode 100644 tensorflow/python/distribute/collective_all_reduce_strategy_test.py create mode 100644 tensorflow/python/distribute/parameter_server_strategy_test.py diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py index 6dda497459f..1f527340d8d 100644 --- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py +++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py @@ -32,11 +32,9 @@ from tensorflow.python.distribute import cross_device_ops as cross_device_ops_li from tensorflow.python.distribute import cross_device_utils from tensorflow.python.distribute import distribute_lib from tensorflow.python.distribute import multi_worker_test_base -from tensorflow.python.distribute import multi_worker_util from tensorflow.python.distribute import reduce_util from tensorflow.python.distribute import strategy_test_lib from tensorflow.python.distribute import values -from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes @@ -54,7 +52,6 @@ from tensorflow.python.ops.losses import losses from tensorflow.python.platform import test from tensorflow.python.training import adam from tensorflow.python.training import training_util -from tensorflow.python.training.server_lib import ClusterSpec class MockCollectiveAllReduceStrategy(distribute_lib.StrategyV1): @@ -71,38 +68,22 @@ class MockCollectiveAllReduceStrategy(distribute_lib.StrategyV1): def create_test_objects(cluster_spec=None, task_type=None, task_id=None, - num_gpus=None, - use_core_strategy=False): + num_gpus=None): sess_config = config_pb2.ConfigProto() if num_gpus is None: num_gpus = context.num_gpus() - if use_core_strategy: - if cluster_spec and task_type and task_id is not None: - cluster_resolver = SimpleClusterResolver( - cluster_spec=multi_worker_util.normalize_cluster_spec(cluster_spec), - task_type=task_type, - task_id=task_id, - num_accelerators={'GPU': num_gpus}) - target = 'grpc://' + cluster_spec[task_type][task_id] - else: - cluster_resolver = SimpleClusterResolver( - ClusterSpec({}), num_accelerators={'GPU': num_gpus}) - target = '' - strategy = MockCollectiveAllReduceStrategy(cluster_resolver) - sess_config = strategy.update_config_proto(sess_config) + strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy( + num_gpus_per_worker=num_gpus) + if task_type and task_id is not None: + strategy.configure( + session_config=sess_config, + cluster_spec=cluster_spec, + task_type=task_type, + task_id=task_id) + target = 'grpc://' + cluster_spec[task_type][task_id] else: - strategy = collective_all_reduce_strategy.CollectiveAllReduceStrategy( - num_gpus_per_worker=num_gpus) - if task_type and task_id is not None: - strategy.configure( - session_config=sess_config, - cluster_spec=cluster_spec, - task_type=task_type, - task_id=task_id) - target = 'grpc://' + cluster_spec[task_type][task_id] - else: - target = '' + target = '' return 
strategy, target, sess_config @@ -120,17 +101,12 @@ class CollectiveAllReduceStrategyTestBase( CollectiveAllReduceStrategyTestBase.collective_key_base += 100000 super(CollectiveAllReduceStrategyTestBase, self).setUp() - def _get_test_object(self, - task_type, - task_id, - num_gpus=0, - use_core_strategy=False): + def _get_test_object(self, task_type, task_id, num_gpus=0): strategy, target, session_config = create_test_objects( cluster_spec=self._cluster_spec, task_type=task_type, task_id=task_id, - num_gpus=num_gpus, - use_core_strategy=use_core_strategy) + num_gpus=num_gpus) collective_keys = cross_device_utils.CollectiveKeys( group_key_start=10 + @@ -144,11 +120,7 @@ class CollectiveAllReduceStrategyTestBase( return strategy, target, session_config - def _test_minimize_loss_graph(self, - task_type, - task_id, - num_gpus, - use_core_strategy=False): + def _test_minimize_loss_graph(self, task_type, task_id, num_gpus): d, master_target, config = self._get_test_object(task_type, task_id, num_gpus) with ops.Graph().as_default(), \ @@ -215,11 +187,7 @@ class CollectiveAllReduceStrategyTestBase( # Error should go down self.assertLess(error_after, error_before) - def _test_complex_model(self, - task_type, - task_id, - num_gpus, - use_core_strategy=False): + def _test_complex_model(self, task_type, task_id, num_gpus): d, master_target, config = self._get_test_object(task_type, task_id, num_gpus) @@ -270,11 +238,7 @@ class CollectiveAllReduceStrategyTestBase( sess.run(variables.global_variables_initializer()) sess.run(train_op) - def _test_variable_initialization(self, - task_type, - task_id, - num_gpus, - use_core_strategy=False): + def _test_variable_initialization(self, task_type, task_id, num_gpus): distribution, master_target, config = self._get_test_object( task_type, task_id, num_gpus) with ops.Graph().as_default(), \ @@ -309,8 +273,7 @@ class CollectiveAllReduceStrategyTestBase( input_fn, expected_values, test_reinitialize=True, - ignore_order=False, - use_core_strategy=False): + ignore_order=False): distribution, master_target, config = self._get_test_object( task_type, task_id, num_gpus) devices = distribution.extended.worker_devices @@ -360,62 +323,41 @@ class DistributedCollectiveAllReduceStrategyTest( cls._cluster_spec = multi_worker_test_base.create_in_process_cluster( num_workers=3, num_ps=0) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def test_num_replicas_in_sync(self, use_core_strategy): + @combinations.generate(combinations.combine(mode=['graph'])) + def test_num_replicas_in_sync(self): distribution, _, _ = create_test_objects( cluster_spec=self._cluster_spec, task_type='worker', task_id=0, - num_gpus=2, - use_core_strategy=use_core_strategy) + num_gpus=2) num_workers = len(self._cluster_spec.get('chief', []) + self._cluster_spec.get('worker', [])) self.assertEqual(2 * num_workers, distribution.num_replicas_in_sync) @combinations.generate( - combinations.combine( - mode=['graph'], - num_gpus=[0, 1, 2], - required_gpus=1, - use_core_strategy=[True, False])) - def testMinimizeLossGraph(self, num_gpus, use_core_strategy): - self._run_between_graph_clients( - self._test_minimize_loss_graph, - self._cluster_spec, - num_gpus, - use_core_strategy=use_core_strategy) + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1)) + def testMinimizeLossGraph(self, num_gpus): + self._run_between_graph_clients(self._test_minimize_loss_graph, + self._cluster_spec, num_gpus) @combinations.generate( - combinations.combine( - 
mode=['graph'], - num_gpus=[0, 1, 2], - required_gpus=1, - use_core_strategy=[True, False])) - def testVariableInitialization(self, num_gpus, use_core_strategy): + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1)) + def testVariableInitialization(self, num_gpus): if context.num_gpus() < num_gpus: self.skipTest('Not enough GPUs') self._run_between_graph_clients( self._test_variable_initialization, self._cluster_spec, - num_gpus=num_gpus, - use_core_strategy=use_core_strategy) + num_gpus=num_gpus) @combinations.generate( - combinations.combine( - mode=['graph'], - num_gpus=[0, 1, 2], - required_gpus=1, - use_core_strategy=[True, False])) - def testComplexModel(self, num_gpus, use_core_strategy): + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1)) + def testComplexModel(self, num_gpus): if context.num_gpus() < num_gpus: self.skipTest('Not enough GPUs') self._run_between_graph_clients( - self._test_complex_model, - self._cluster_spec, - num_gpus=num_gpus, - use_core_strategy=use_core_strategy) + self._test_complex_model, self._cluster_spec, num_gpus=num_gpus) # TODO(yuefengz): Update how we use num_gpus and required_gpus @combinations.generate( @@ -423,9 +365,8 @@ class DistributedCollectiveAllReduceStrategyTest( mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1, - use_dataset=[True, False], - use_core_strategy=[True, False])) - def testMakeInputFnIterator(self, num_gpus, use_dataset, use_core_strategy): + use_dataset=[True, False])) + def testMakeInputFnIterator(self, num_gpus, use_dataset): if context.num_gpus() < num_gpus: self.skipTest('Not enough GPUs') if use_dataset: @@ -452,17 +393,12 @@ class DistributedCollectiveAllReduceStrategyTest( input_fn, expected_values, test_reinitialize=use_dataset, - ignore_order=not use_dataset, - use_core_strategy=use_core_strategy) + ignore_order=not use_dataset) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testUpdateConfigProto(self, use_core_strategy): + @combinations.generate(combinations.combine(mode=['graph'])) + def testUpdateConfigProto(self): strategy, _, _ = self._get_test_object( - task_type='worker', - task_id=1, - num_gpus=2, - use_core_strategy=use_core_strategy) + task_type='worker', task_id=1, num_gpus=2) config_proto = config_pb2.ConfigProto(device_filters=['to_be_overridden']) rewrite_options = config_proto.graph_options.rewrite_options @@ -484,29 +420,6 @@ class DistributedCollectiveAllReduceStrategyTest( self.assertEqual(['CollectiveReduce'], new_rewrite_options.scoped_allocator_opts.enable_op) - @combinations.generate(combinations.combine(mode=['eager'])) - def testEnableCollectiveOps(self): - mock_called = [False] - - # pylint: disable=dangerous-default-value - def mock_enable_collective_ops(server_def, mock_called=mock_called): - self.assertEqual('worker', server_def.job_name) - self.assertEqual(1, server_def.task_index) - self.assertEqual('grpc', server_def.protocol) - mock_called[0] = True - - def mock_configure_collective_ops(*args, **kwargs): - del args, kwargs - - with test.mock.patch.object(context.context(), 'enable_collective_ops', - mock_enable_collective_ops), \ - test.mock.patch.object(context.context(), 'configure_collective_ops', - mock_configure_collective_ops): - strategy, _, _ = self._get_test_object( - task_type='worker', task_id=1, num_gpus=2, use_core_strategy=True) - self.assertTrue(strategy.extended._std_server_started) - self.assertTrue(mock_called[0]) - class 
DistributedCollectiveAllReduceStrategyTestWithChief( CollectiveAllReduceStrategyTestBase, parameterized.TestCase): @@ -550,41 +463,28 @@ class LocalCollectiveAllReduceStrategy( @combinations.generate( combinations.combine( - mode=['graph', 'eager'], - num_gpus=[2, 4], - required_gpus=2, - use_core_strategy=[True, False])) - def testMinimizeLoss(self, num_gpus, use_core_strategy): + mode=['graph', 'eager'], num_gpus=[2, 4], required_gpus=2)) + def testMinimizeLoss(self, num_gpus): # Collective ops doesn't support strategy with one device. if context.num_gpus() < num_gpus: self.skipTest('Not enough GPUs') if context.executing_eagerly(): - strategy, _, _ = self._get_test_object( - None, None, num_gpus, use_core_strategy=use_core_strategy) + strategy, _, _ = self._get_test_object(None, None, num_gpus) self._test_minimize_loss_eager(strategy) else: - self._test_minimize_loss_graph( - None, None, num_gpus, use_core_strategy=use_core_strategy) + self._test_minimize_loss_graph(None, None, num_gpus) @combinations.generate( - combinations.combine( - mode=['graph'], - num_gpus=[2, 4], - required_gpus=2, - use_core_strategy=[True, False])) - def testComplexModel(self, num_gpus, use_core_strategy): + combinations.combine(mode=['graph'], num_gpus=[2, 4], required_gpus=2)) + def testComplexModel(self, num_gpus): if context.num_gpus() < num_gpus: self.skipTest('Not enough GPUs') - self._test_complex_model( - None, None, num_gpus, use_core_strategy=use_core_strategy) + self._test_complex_model(None, None, num_gpus) @combinations.generate( combinations.combine( - mode=['graph', 'eager'], - required_gpus=2, - use_dataset=[True, False], - use_core_strategy=[True, False])) - def testMakeInputFnIterator(self, use_dataset, use_core_strategy): + mode=['graph', 'eager'], required_gpus=2, use_dataset=[True, False])) + def testMakeInputFnIterator(self, use_dataset): num_gpus = 2 if use_dataset: fn = lambda: dataset_ops.Dataset.range(5 * num_gpus) @@ -607,71 +507,56 @@ class LocalCollectiveAllReduceStrategy( input_fn, expected_values, test_reinitialize=use_dataset, - ignore_order=not use_dataset, - use_core_strategy=use_core_strategy) + ignore_order=not use_dataset) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testAllReduceSum(self, use_core_strategy): + @combinations.generate(combinations.combine(mode=['graph'])) + def testAllReduceSum(self): if context.num_gpus() < 2: self.skipTest('Not enough GPUs') - distribution, target, config = self._get_test_object( - None, None, num_gpus=2, use_core_strategy=use_core_strategy) + distribution, target, config = self._get_test_object(None, None, num_gpus=2) with self.cached_session(config=config, target=target): self._test_all_reduce_sum(distribution) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testAllReduceSumGradients(self, use_core_strategy): + @combinations.generate(combinations.combine(mode=['graph'])) + def testAllReduceSumGradients(self): if context.num_gpus() < 2: self.skipTest('Not enough GPUs') - distribution, target, config = self._get_test_object( - None, None, num_gpus=2, use_core_strategy=use_core_strategy) + distribution, target, config = self._get_test_object(None, None, num_gpus=2) with self.cached_session(config=config, target=target): self._test_all_reduce_sum_gradients(distribution) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testAllReduceSumGradientTape(self, use_core_strategy): 
+ @combinations.generate(combinations.combine(mode=['graph'])) + def testAllReduceSumGradientTape(self): if context.num_gpus() < 2: self.skipTest('Not enough GPUs') - distribution, target, config = self._get_test_object( - None, None, num_gpus=2, use_core_strategy=use_core_strategy) + distribution, target, config = self._get_test_object(None, None, num_gpus=2) with self.cached_session(config=config, target=target): self._test_all_reduce_sum_gradient_tape(distribution) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testAllReduceMean(self, use_core_strategy): + @combinations.generate(combinations.combine(mode=['graph'])) + def testAllReduceMean(self): if context.num_gpus() < 2: self.skipTest('Not enough GPUs') - distribution, target, config = self._get_test_object( - None, None, num_gpus=2, use_core_strategy=use_core_strategy) + distribution, target, config = self._get_test_object(None, None, num_gpus=2) with self.cached_session(config=config, target=target): self._test_all_reduce_mean(distribution) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testAllReduceMeanGradients(self, use_core_strategy): + @combinations.generate(combinations.combine(mode=['graph'])) + def testAllReduceMeanGradients(self): if context.num_gpus() < 2: self.skipTest('Not enough GPUs') - distribution, target, config = self._get_test_object( - None, None, num_gpus=2, use_core_strategy=use_core_strategy) + distribution, target, config = self._get_test_object(None, None, num_gpus=2) with self.cached_session(config=config, target=target): self._test_all_reduce_mean_gradients(distribution) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testAllReduceMeanGradientTape(self, use_core_strategy): + @combinations.generate(combinations.combine(mode=['graph'])) + def testAllReduceMeanGradientTape(self): if context.num_gpus() < 2: self.skipTest('Not enough GPUs') - distribution, target, config = self._get_test_object( - None, None, num_gpus=2, use_core_strategy=use_core_strategy) + distribution, target, config = self._get_test_object(None, None, num_gpus=2) with self.cached_session(config=config, target=target): self._test_all_reduce_mean_gradient_tape(distribution) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testNumpyIterator(self, use_core_strategy): + @combinations.generate(combinations.combine(mode=['graph'])) + def testNumpyIterator(self): num_gpus = 2 if context.num_gpus() < num_gpus: self.skipTest('Not enough GPUs') - strategy, _, _ = self._get_test_object( - None, None, num_gpus=num_gpus, use_core_strategy=use_core_strategy) + strategy, _, _ = self._get_test_object(None, None, num_gpus=num_gpus) self._test_numpy_iterator(strategy) diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py index 12926cfa164..a4d5f0cf5a1 100644 --- a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py +++ b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py @@ -24,17 +24,14 @@ from absl.testing import parameterized from tensorflow.contrib.distribute.python import parameter_server_strategy from tensorflow.core.protobuf import config_pb2 from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.distribute import central_storage_strategy from 
tensorflow.python.distribute import combinations from tensorflow.python.distribute import device_util from tensorflow.python.distribute import distribution_strategy_context as ds_context from tensorflow.python.distribute import multi_worker_test_base from tensorflow.python.distribute import multi_worker_util -from tensorflow.python.distribute import parameter_server_strategy as core_parameter_server_strategy from tensorflow.python.distribute import reduce_util from tensorflow.python.distribute import strategy_test_lib from tensorflow.python.distribute import values -from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver from tensorflow.python.eager import backprop from tensorflow.python.eager import context from tensorflow.python.estimator import run_config @@ -69,42 +66,24 @@ def create_test_objects(cluster_spec=None, task_type=None, task_id=None, num_gpus=None, - sess_config=None, - use_core_strategy=False): + sess_config=None): sess_config = sess_config or config_pb2.ConfigProto() if num_gpus is None: num_gpus = context.num_gpus() - if use_core_strategy: - if cluster_spec and task_type and task_id is not None: - cluster_resolver = SimpleClusterResolver( - cluster_spec=multi_worker_util.normalize_cluster_spec(cluster_spec), - task_type=task_type, - task_id=task_id, - num_accelerators={'GPU': num_gpus}) - distribution = core_parameter_server_strategy.ParameterServerStrategy( - cluster_resolver) - target = 'grpc://' + cluster_spec[WORKER][task_id] - else: - distribution = ( - central_storage_strategy.CentralStorageStrategy._from_num_gpus( - num_gpus)) - target = '' + distribution = parameter_server_strategy.ParameterServerStrategy( + num_gpus_per_worker=num_gpus) + + if task_type: sess_config = copy.deepcopy(sess_config) - sess_config = distribution.update_config_proto(sess_config) + distribution.configure( + session_config=sess_config, + cluster_spec=cluster_spec, + task_type=task_type, + task_id=task_id) + target = 'grpc://' + cluster_spec[WORKER][task_id] else: - distribution = parameter_server_strategy.ParameterServerStrategy( - num_gpus_per_worker=num_gpus) - if task_type: - sess_config = copy.deepcopy(sess_config) - distribution.configure( - session_config=sess_config, - cluster_spec=cluster_spec, - task_type=task_type, - task_id=task_id) - target = 'grpc://' + cluster_spec[WORKER][task_id] - else: - target = '' + target = '' return distribution, target, sess_config @@ -122,27 +101,17 @@ class ParameterServerStrategyTestBase( self._sess_config = config_pb2.ConfigProto(allow_soft_placement=True) super(ParameterServerStrategyTestBase, self).setUp() - def _get_test_objects(self, - task_type, - task_id, - num_gpus, - use_core_strategy=False): + def _get_test_objects(self, task_type, task_id, num_gpus): return create_test_objects( cluster_spec=self._cluster_spec, task_type=task_type, task_id=task_id, num_gpus=num_gpus, - sess_config=self._sess_config, - use_core_strategy=use_core_strategy) + sess_config=self._sess_config) - def _test_device_assignment_distributed(self, - task_type, - task_id, - num_gpus, - use_core_strategy=False): + def _test_device_assignment_distributed(self, task_type, task_id, num_gpus): worker_device = '/job:%s/replica:0/task:%d' % (task_type, task_id) - d, _, sess_config = self._get_test_objects( - task_type, task_id, num_gpus, use_core_strategy=use_core_strategy) + d, _, sess_config = self._get_test_objects(task_type, task_id, num_gpus) with ops.Graph().as_default(), \ self.cached_session(target=self._default_target, config=sess_config) as 
sess, \ @@ -240,9 +209,8 @@ class ParameterServerStrategyTestBase( self.assertEqual(f_val, 46.0) def _test_device_assignment_distributed_enable_partitioner( - self, task_type, task_id, num_gpus, use_core_strategy=False): - d, _, sess_config = self._get_test_objects( - task_type, task_id, num_gpus, use_core_strategy=use_core_strategy) + self, task_type, task_id, num_gpus): + d, _, sess_config = self._get_test_objects(task_type, task_id, num_gpus) num_shards = len(d.extended.parameter_devices) partitioner = partitioned_variables.fixed_size_partitioner(num_shards) with ops.Graph().as_default(), \ @@ -390,13 +358,9 @@ class ParameterServerStrategyTestBase( self.assertEqual(z_val, 43.0) self.assertEqual(f_val, 46.0) - def _test_simple_increment(self, - task_type, - task_id, - num_gpus, - use_core_strategy=False): + def _test_simple_increment(self, task_type, task_id, num_gpus): d, master_target, sess_config = self._get_test_objects( - task_type, task_id, num_gpus, use_core_strategy=use_core_strategy) + task_type, task_id, num_gpus) if d.extended._cluster_spec: num_workers = len(d.extended._cluster_spec.as_dict().get(WORKER)) if 'chief' in d.extended._cluster_spec.as_dict(): @@ -462,13 +426,9 @@ class ParameterServerStrategyTestBase( self.assertEqual(y_val, 20.0 + 1.0 * num_workers * d.num_replicas_in_sync) self.assertEqual(z_val, 30.0 + 1.0 * num_workers) - def _test_minimize_loss_graph(self, - task_type, - task_id, - num_gpus, - use_core_strategy=False): + def _test_minimize_loss_graph(self, task_type, task_id, num_gpus): d, master_target, sess_config = self._get_test_objects( - task_type, task_id, num_gpus, use_core_strategy=use_core_strategy) + task_type, task_id, num_gpus) if task_type: # Multi-worker assert hasattr(d.extended, '_cluster_spec') and d.extended._cluster_spec @@ -561,10 +521,9 @@ class ParameterServerStrategyTestBase( input_fn, expected_values, test_reinitialize=True, - ignore_order=False, - use_core_strategy=False): + ignore_order=False): distribution, master_target, config = self._get_test_objects( - task_type, task_id, num_gpus, use_core_strategy=use_core_strategy) + task_type, task_id, num_gpus) devices = distribution.extended.worker_devices with ops.Graph().as_default(), \ @@ -613,84 +572,62 @@ class ParameterServerStrategyTest( num_workers=3, num_ps=2) cls._default_target = 'grpc://' + cls._cluster_spec[WORKER][0] - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def test_num_replicas_in_sync(self, use_core_strategy): - strategy, _, _ = create_test_objects( - num_gpus=2, use_core_strategy=use_core_strategy) + @combinations.generate(combinations.combine(mode=['graph'])) + def test_num_replicas_in_sync(self): + strategy, _, _ = create_test_objects(num_gpus=2) # All the devices on a given worker are in sync which in this case is the # number of gpus on each worker. 
self.assertEqual(2, strategy.num_replicas_in_sync) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testDeviceAssignmentLocalCPU(self, use_core_strategy): - strategy, _, _ = create_test_objects( - num_gpus=0, use_core_strategy=use_core_strategy) + @combinations.generate(combinations.combine(mode=['graph'])) + def testDeviceAssignmentLocalCPU(self): + strategy, _, _ = create_test_objects(num_gpus=0) self._test_device_assignment_local( strategy, compute_device='CPU', variable_device='CPU', num_gpus=0) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testDeviceAssignmentLocalOneGPU(self, use_core_strategy): - strategy, _, _ = create_test_objects( - num_gpus=1, use_core_strategy=use_core_strategy) + @combinations.generate(combinations.combine(mode=['graph'])) + def testDeviceAssignmentLocalOneGPU(self): + strategy, _, _ = create_test_objects(num_gpus=1) self._test_device_assignment_local( strategy, compute_device='GPU', variable_device='GPU', num_gpus=1) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testDeviceAssignmentLocalTwoGPUs(self, use_core_strategy): - strategy, _, _ = create_test_objects( - num_gpus=2, use_core_strategy=use_core_strategy) + @combinations.generate(combinations.combine(mode=['graph'])) + def testDeviceAssignmentLocalTwoGPUs(self): + strategy, _, _ = create_test_objects(num_gpus=2) self._test_device_assignment_local( strategy, compute_device='GPU', variable_device='CPU', num_gpus=2) @combinations.generate( - combinations.combine( - mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False])) - def testDeviceAssignmentDistributed(self, num_gpus, use_core_strategy): - self._test_device_assignment_distributed( - 'worker', 1, num_gpus, use_core_strategy=use_core_strategy) + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2])) + def testDeviceAssignmentDistributed(self, num_gpus): + self._test_device_assignment_distributed('worker', 1, num_gpus) @combinations.generate( - combinations.combine( - mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False])) - def testDeviceAssignmentDistributedEnablePartitioner(self, num_gpus, - use_core_strategy): + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2])) + def testDeviceAssignmentDistributedEnablePartitioner(self, num_gpus): self._test_device_assignment_distributed_enable_partitioner( - 'worker', 1, num_gpus, use_core_strategy=use_core_strategy) + 'worker', 1, num_gpus) + + @combinations.generate(combinations.combine(mode=['graph'])) + def testSimpleBetweenGraph(self): + self._run_between_graph_clients(self._test_simple_increment, + self._cluster_spec, context.num_gpus()) @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testSimpleBetweenGraph(self, use_core_strategy): - self._run_between_graph_clients( - self._test_simple_increment, - self._cluster_spec, - context.num_gpus(), - use_core_strategy=use_core_strategy) + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2])) + def testLocalSimpleIncrement(self, num_gpus): + self._test_simple_increment(None, 0, num_gpus) @combinations.generate( - combinations.combine( - mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False])) - def testLocalSimpleIncrement(self, num_gpus, use_core_strategy): - self._test_simple_increment(None, 0, num_gpus, use_core_strategy) + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2])) + def 
testMinimizeLossGraphDistributed(self, num_gpus): + self._run_between_graph_clients(self._test_minimize_loss_graph, + self._cluster_spec, num_gpus) @combinations.generate( - combinations.combine( - mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False])) - def testMinimizeLossGraphDistributed(self, num_gpus, use_core_strategy): - self._run_between_graph_clients( - self._test_minimize_loss_graph, - self._cluster_spec, - num_gpus, - use_core_strategy=use_core_strategy) - - @combinations.generate( - combinations.combine( - mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False])) - def testMinimizeLossGraphLocal(self, num_gpus, use_core_strategy): - self._test_minimize_loss_graph(None, None, num_gpus, use_core_strategy) + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2])) + def testMinimizeLossGraphLocal(self, num_gpus): + self._test_minimize_loss_graph(None, None, num_gpus) # TODO(priyag): Refactor this and other multi worker tests. @combinations.generate( @@ -698,10 +635,8 @@ class ParameterServerStrategyTest( mode=['graph'], num_gpus=[1, 2], required_gpus=1, - use_core_strategy=[True, False], use_dataset=[True, False])) - def testMakeInputFnIteratorDistributed( - self, num_gpus, use_core_strategy, use_dataset): + def testMakeInputFnIteratorDistributed(self, num_gpus, use_dataset): if context.num_gpus() < num_gpus: self.skipTest('Not enough GPUs') if use_dataset: @@ -726,18 +661,15 @@ class ParameterServerStrategyTest( input_fn, expected_values, test_reinitialize=use_dataset, - ignore_order=not use_dataset, - use_core_strategy=use_core_strategy) + ignore_order=not use_dataset) @combinations.generate( combinations.combine( mode=['graph'], num_gpus=[1, 2], required_gpus=1, - use_core_strategy=[True, False], use_dataset=[True, False])) - def testMakeInputFnIteratorLocal(self, num_gpus, use_core_strategy, - use_dataset): + def testMakeInputFnIteratorLocal(self, num_gpus, use_dataset): if context.num_gpus() < num_gpus: self.skipTest('Not enough GPUs') if use_dataset: @@ -762,24 +694,20 @@ class ParameterServerStrategyTest( input_fn, expected_values, test_reinitialize=use_dataset, - ignore_order=not use_dataset, - use_core_strategy=use_core_strategy) + ignore_order=not use_dataset) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testGlobalStepUpdate(self, use_core_strategy): - strategy, _, _ = create_test_objects(use_core_strategy=use_core_strategy) + @combinations.generate(combinations.combine(mode=['graph'])) + def testGlobalStepUpdate(self): + strategy, _, _ = create_test_objects() self._test_global_step_update(strategy) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testUpdateConfigProtoMultiWorker(self, use_core_strategy): + @combinations.generate(combinations.combine(mode=['graph'])) + def testUpdateConfigProtoMultiWorker(self): strategy, _, _ = create_test_objects( cluster_spec=self._cluster_spec, task_type='worker', task_id=1, - num_gpus=2, - use_core_strategy=use_core_strategy) + num_gpus=2) config_proto = config_pb2.ConfigProto(device_filters=['to_be_overridden']) @@ -792,11 +720,9 @@ class ParameterServerStrategyTest( # Verify isolate_session_state self.assertFalse(new_config.isolate_session_state) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testUpdateConfigProtoLocal(self, use_core_strategy): - strategy, _, _ = create_test_objects( - num_gpus=2, 
use_core_strategy=use_core_strategy) + @combinations.generate(combinations.combine(mode=['graph'])) + def testUpdateConfigProtoLocal(self): + strategy, _, _ = create_test_objects(num_gpus=2) config_proto = config_pb2.ConfigProto() new_config = strategy.update_config_proto(config_proto) @@ -854,30 +780,20 @@ class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase, num_workers=3, num_ps=2, has_chief=True) cls._default_target = 'grpc://' + cls._cluster_spec[CHIEF][0] - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testSimpleBetweenGraph(self, use_core_strategy): - self._run_between_graph_clients( - self._test_simple_increment, - self._cluster_spec, - context.num_gpus(), - use_core_strategy=use_core_strategy) + @combinations.generate(combinations.combine(mode=['graph'])) + def testSimpleBetweenGraph(self): + self._run_between_graph_clients(self._test_simple_increment, + self._cluster_spec, context.num_gpus()) @combinations.generate( - combinations.combine( - mode=['graph'], num_gpus=[0, 1, 2], use_core_strategy=[True, False])) - def testMinimizeLossGraph(self, num_gpus, use_core_strategy): - self._run_between_graph_clients( - self._test_minimize_loss_graph, - self._cluster_spec, - num_gpus, - use_core_strategy=use_core_strategy) + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2])) + def testMinimizeLossGraph(self, num_gpus): + self._run_between_graph_clients(self._test_minimize_loss_graph, + self._cluster_spec, num_gpus) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testGlobalStepIsWrappedOnTwoGPUs(self, use_core_strategy): - strategy, _, _ = create_test_objects( - num_gpus=2, use_core_strategy=use_core_strategy) + @combinations.generate(combinations.combine(mode=['graph'])) + def testGlobalStepIsWrappedOnTwoGPUs(self): + strategy, _, _ = create_test_objects(num_gpus=2) with ops.Graph().as_default(), strategy.scope(): created_step = training_util.create_global_step() get_step = training_util.get_global_step() @@ -889,11 +805,9 @@ class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase, self.assertIs(values.AggregatingVariable, type(get_step)) self.assertIs(strategy, created_step.distribute_strategy) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testGlobalStepIsNotWrappedOnOneGPU(self, use_core_strategy): - strategy, _, _ = create_test_objects( - num_gpus=1, use_core_strategy=use_core_strategy) + @combinations.generate(combinations.combine(mode=['graph'])) + def testGlobalStepIsNotWrappedOnOneGPU(self): + strategy, _, _ = create_test_objects(num_gpus=1) with ops.Graph().as_default(), strategy.scope(): created_step = training_util.create_global_step() get_step = training_util.get_global_step() @@ -908,11 +822,9 @@ class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase, self.assertFalse(hasattr(strategy, 'distribute_strategy')) self.assertIs(strategy, created_step._distribute_strategy) - @combinations.generate( - combinations.combine(mode=['graph'], use_core_strategy=[True, False])) - def testValueContainer(self, use_core_strategy): - strategy, _, _ = create_test_objects( - num_gpus=2, use_core_strategy=use_core_strategy) + @combinations.generate(combinations.combine(mode=['graph'])) + def testValueContainer(self): + strategy, _, _ = create_test_objects(num_gpus=2) with ops.Graph().as_default(), strategy.scope(): def f(): @@ -930,11 +842,9 @@ class 
CentralStorageStrategyTest(strategy_test_lib.DistributionTestBase, parameterized.TestCase): @combinations.generate(combinations.combine(mode=['graph', 'eager'], - use_core_strategy=[True, False], required_gpus=2)) - def testNumpyDataset(self, use_core_strategy): - strategy, _, _ = create_test_objects( - num_gpus=2, use_core_strategy=use_core_strategy) + def testNumpyDataset(self): + strategy, _, _ = create_test_objects(num_gpus=2) self._test_numpy_dataset(strategy) diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index 91edc480673..899e5c45de7 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -1172,3 +1172,69 @@ distribute_py_test( "//tensorflow/python/eager:test", ], ) + +cuda_py_test( + name = "collective_all_reduce_strategy_test", + srcs = ["collective_all_reduce_strategy_test.py"], + additional_deps = [ + ":collective_all_reduce_strategy", + ":combinations", + ":strategy_combinations", + ":multi_worker_test_base", + ":strategy_test_lib", + "@absl_py//absl/testing:parameterized", + "//third_party/py/numpy", + "//tensorflow/core:protos_all_py", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:constant_op", + "//tensorflow/python:dtypes", + "//tensorflow/python:framework_ops", + "//tensorflow/python:gradients", + "//tensorflow/python:init_ops", + "//tensorflow/python:layers", + "//tensorflow/python:variable_scope", + "//tensorflow/python:variables", + "//tensorflow/python/distribute:cross_device_utils", + "//tensorflow/python/eager:context", + "//tensorflow/python/estimator:estimator_py", + ], + tags = [ + "multi_and_single_gpu", + ], +) + +cuda_py_test( + name = "parameter_server_strategy_test", + srcs = ["parameter_server_strategy_test.py"], + additional_deps = [ + ":parameter_server_strategy", + ":central_storage_strategy", + ":combinations", + ":strategy_combinations", + ":multi_worker_test_base", + ":strategy_test_lib", + "@absl_py//absl/testing:parameterized", + "//tensorflow/core:protos_all_py", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:constant_op", + "//tensorflow/python:control_flow_ops", + "//tensorflow/python:framework_ops", + "//tensorflow/python:gradients", + "//tensorflow/python:layers", + "//tensorflow/python:session", + "//tensorflow/python:tensor_util", + "//tensorflow/python:training", + "//tensorflow/python:variable_scope", + "//tensorflow/python:variables", + "//tensorflow/python/distribute:multi_worker_util", + "//tensorflow/python/distribute:values", + "//tensorflow/python/eager:context", + "//tensorflow/python/estimator:estimator_py", + ], + tags = [ + "multi_and_single_gpu", + "no_oss", # TODO(b/133330625) + ], +) diff --git a/tensorflow/python/distribute/collective_all_reduce_strategy_test.py b/tensorflow/python/distribute/collective_all_reduce_strategy_test.py new file mode 100644 index 00000000000..f9e2a116641 --- /dev/null +++ b/tensorflow/python/distribute/collective_all_reduce_strategy_test.py @@ -0,0 +1,592 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for CollectiveAllReduceStrategy.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl.testing import parameterized +import numpy as np + +from tensorflow.core.protobuf import config_pb2 +from tensorflow.core.protobuf import rewriter_config_pb2 +from tensorflow.python import keras +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.distribute import collective_all_reduce_strategy +from tensorflow.python.distribute import combinations +from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib +from tensorflow.python.distribute import cross_device_utils +from tensorflow.python.distribute import distribute_lib +from tensorflow.python.distribute import multi_worker_test_base +from tensorflow.python.distribute import multi_worker_util +from tensorflow.python.distribute import reduce_util +from tensorflow.python.distribute import strategy_test_lib +from tensorflow.python.distribute import values +from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver +from tensorflow.python.eager import context +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors +from tensorflow.python.framework import ops +from tensorflow.python.layers import core +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gradients +from tensorflow.python.ops import init_ops +from tensorflow.python.ops import nn +from tensorflow.python.ops import random_ops +from tensorflow.python.ops import variable_scope +from tensorflow.python.ops import variables +from tensorflow.python.ops.losses import losses +from tensorflow.python.platform import test +from tensorflow.python.training import adam +from tensorflow.python.training import training_util +from tensorflow.python.training.server_lib import ClusterSpec + + +class MockCollectiveAllReduceStrategy(distribute_lib.StrategyV1): + """Mock the strategy to allow cluster resolver as an argument.""" + + def __init__(self, cluster_resolver): + super(MockCollectiveAllReduceStrategy, self).__init__( + collective_all_reduce_strategy.CollectiveAllReduceExtended( + self, + communication=cross_device_ops_lib.CollectiveCommunication.AUTO, + cluster_resolver=cluster_resolver)) + + +def create_test_objects(cluster_spec=None, + task_type=None, + task_id=None, + num_gpus=None): + sess_config = config_pb2.ConfigProto() + if num_gpus is None: + num_gpus = context.num_gpus() + + if cluster_spec and task_type and task_id is not None: + cluster_resolver = SimpleClusterResolver( + cluster_spec=multi_worker_util.normalize_cluster_spec(cluster_spec), + task_type=task_type, + task_id=task_id, + num_accelerators={'GPU': num_gpus}) + target = 'grpc://' + cluster_spec[task_type][task_id] + else: + cluster_resolver = SimpleClusterResolver( + ClusterSpec({}), num_accelerators={'GPU': num_gpus}) + target = '' + + strategy = 
MockCollectiveAllReduceStrategy(cluster_resolver) + sess_config = strategy.update_config_proto(sess_config) + + return strategy, target, sess_config + + +class CollectiveAllReduceStrategyTestBase( + multi_worker_test_base.MultiWorkerTestBase): + + collective_key_base = 0 + + def setUp(self): + # We use a different key_base for each test so that collective keys won't be + # reused. + # TODO(yuefengz, ayushd): enable it to reuse collective keys in different + # tests. + CollectiveAllReduceStrategyTestBase.collective_key_base += 100000 + super(CollectiveAllReduceStrategyTestBase, self).setUp() + + def _get_test_object(self, task_type, task_id, num_gpus=0): + strategy, target, session_config = create_test_objects( + cluster_spec=self._cluster_spec, + task_type=task_type, + task_id=task_id, + num_gpus=num_gpus) + + collective_keys = cross_device_utils.CollectiveKeys( + group_key_start=10 + + CollectiveAllReduceStrategyTestBase.collective_key_base, + op_instance_key_start=100 + + CollectiveAllReduceStrategyTestBase.collective_key_base, + variable_instance_key_start=10000 + + CollectiveAllReduceStrategyTestBase.collective_key_base) + strategy.extended._collective_keys = collective_keys + strategy.extended._cross_device_ops._collective_keys = (collective_keys) + + return strategy, target, session_config + + def _test_minimize_loss_graph(self, task_type, task_id, num_gpus): + d, master_target, config = self._get_test_object(task_type, task_id, + num_gpus) + with ops.Graph().as_default(), \ + self.cached_session(config=config, + target=master_target) as sess, \ + d.scope(): + l = core.Dense(1, use_bias=False, + name='gpu_%d' % d.extended._num_gpus_per_worker) + + def loss_fn(x): + y = array_ops.reshape(l(x), []) - constant_op.constant(1.) + return y * y + + # TODO(yuefengz, apassos): eager.backprop.implicit_grad is not safe for + # multiple graphs (b/111216820). + def grad_fn(x): + loss = loss_fn(x) + var_list = ( + variables.trainable_variables() + ops.get_collection( + ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)) + grads = gradients.gradients(loss, var_list) + ret = list(zip(grads, var_list)) + return ret + + def update(v, g): + return v.assign_sub(0.05 * g, use_locking=True) + + one = constant_op.constant([[1.]]) + + def step(): + """Perform one optimization step.""" + # Run forward & backward to get gradients, variables list. + g_v = d.extended.call_for_each_replica(grad_fn, args=[one]) + # Update the variables using the gradients and the update() function. + before_list = [] + after_list = [] + for g, v in g_v: + fetched = d.extended.read_var(v) + before_list.append(fetched) + with ops.control_dependencies([fetched]): + # TODO(yuefengz): support non-Mirrored variable as destinations. 
+ g = d.extended.reduce_to( + reduce_util.ReduceOp.SUM, g, destinations=v) + with ops.control_dependencies( + d.extended.update(v, update, args=(g,), group=False)): + after_list.append(d.extended.read_var(v)) + return before_list, after_list + + before_out, after_out = step() + + if context.num_gpus() < d.extended._num_gpus_per_worker: + return True + + sess.run(variables.global_variables_initializer()) + + for i in range(10): + b, a = sess.run((before_out, after_out)) + if i == 0: + before, = b + after, = a + + error_before = abs(before - 1) + error_after = abs(after - 1) + # Error should go down + self.assertLess(error_after, error_before) + + def _test_complex_model(self, task_type, task_id, num_gpus): + d, master_target, config = self._get_test_object(task_type, task_id, + num_gpus) + + def model_fn(): + """Mnist model with synthetic input.""" + data_format = 'channels_last' + input_shape = [28, 28, 1] + l = keras.layers + max_pool = l.MaxPooling2D((2, 2), (2, 2), + padding='same', + data_format=data_format) + model = keras.Sequential([ + l.Reshape(target_shape=input_shape, input_shape=(28 * 28,)), + l.Conv2D( + 32, + 5, + padding='same', + data_format=data_format, + activation=nn.relu), max_pool, + l.Conv2D( + 64, + 5, + padding='same', + data_format=data_format, + activation=nn.relu), max_pool, + l.Flatten(), + l.Dense(1024, activation=nn.relu), + l.Dropout(0.4), + l.Dense(10) + ]) + image = random_ops.random_uniform([2, 28, 28]) + label = random_ops.random_uniform([2, 1], maxval=10, dtype=dtypes.int32) + logits = model(image, training=True) + # TODO(yuefengz): make loss a callable for eager mode. + loss = losses.sparse_softmax_cross_entropy(labels=label, logits=logits) + optimizer = adam.AdamOptimizer(learning_rate=1e-4) + train_op = optimizer.minimize(loss, + training_util.get_or_create_global_step()) + return train_op + + with ops.Graph().as_default(), \ + self.cached_session(config=config, + target=master_target) as sess: + with d.scope(): + train_op = d.extended.call_for_each_replica(model_fn) + train_op = d.group(d.experimental_local_results(train_op)) + + sess.run(variables.global_variables_initializer()) + sess.run(train_op) + + def _test_variable_initialization(self, task_type, task_id, num_gpus): + distribution, master_target, config = self._get_test_object( + task_type, task_id, num_gpus) + with ops.Graph().as_default(), \ + self.cached_session(config=config, + target=master_target) as sess, \ + distribution.scope(): + + def model_fn(): + x = variable_scope.get_variable( + 'x', + shape=(2, 3), + initializer=init_ops.random_uniform_initializer( + 1.0, 10.0, dtype=dtypes.float32)) + return array_ops.identity(x) + + x = distribution.extended.call_for_each_replica(model_fn) + reduced_x = distribution.reduce(reduce_util.ReduceOp.MEAN, x, axis=None) + x = distribution.experimental_local_results(x)[0] + + sess.run(variables.global_variables_initializer()) + + x_value, reduced_x_value = sess.run([x, reduced_x]) + self.assertTrue( + np.allclose(x_value, reduced_x_value, atol=1e-5), + msg=('x_value = %r, reduced_x_value = %r' % (x_value, + reduced_x_value))) + + def _test_input_fn_iterator(self, + task_type, + task_id, + num_gpus, + input_fn, + expected_values, + test_reinitialize=True, + ignore_order=False): + distribution, master_target, config = self._get_test_object( + task_type, task_id, num_gpus) + devices = distribution.extended.worker_devices + + with ops.Graph().as_default(), \ + self.cached_session(config=config, + target=master_target) as sess: + iterator = 
distribution.make_input_fn_iterator(input_fn) + sess.run(iterator.initialize()) + + for expected_value in expected_values: + next_element = iterator.get_next() + computed_value = sess.run([values.select_replica(r, next_element) + for r in range(len(devices))]) + if ignore_order: + self.assertCountEqual(expected_value, computed_value) + else: + self.assertEqual(expected_value, computed_value) + + with self.assertRaises(errors.OutOfRangeError): + next_element = iterator.get_next() + sess.run([values.select_replica(r, next_element) + for r in range(len(devices))]) + + # After re-initializing the iterator, should be able to iterate again. + if test_reinitialize: + sess.run(iterator.initialize()) + + for expected_value in expected_values: + next_element = iterator.get_next() + computed_value = sess.run([values.select_replica(r, next_element) + for r in range(len(devices))]) + if ignore_order: + self.assertCountEqual(expected_value, computed_value) + else: + self.assertEqual(expected_value, computed_value) + + +class DistributedCollectiveAllReduceStrategyTest( + CollectiveAllReduceStrategyTestBase, + strategy_test_lib.DistributionTestBase, + parameterized.TestCase): + + @classmethod + def setUpClass(cls): + """Create a local cluster with 3 workers.""" + cls._cluster_spec = multi_worker_test_base.create_in_process_cluster( + num_workers=3, num_ps=0) + + @combinations.generate(combinations.combine(mode=['graph'])) + def test_num_replicas_in_sync(self): + distribution, _, _ = create_test_objects( + cluster_spec=self._cluster_spec, + task_type='worker', + task_id=0, + num_gpus=2) + num_workers = len(self._cluster_spec.get('chief', []) + + self._cluster_spec.get('worker', [])) + self.assertEqual(2 * num_workers, + distribution.num_replicas_in_sync) + + @combinations.generate( + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1)) + def testMinimizeLossGraph(self, num_gpus): + self._run_between_graph_clients(self._test_minimize_loss_graph, + self._cluster_spec, num_gpus) + + @combinations.generate( + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1)) + def testVariableInitialization(self, num_gpus): + if context.num_gpus() < num_gpus: + self.skipTest('Not enough GPUs') + self._run_between_graph_clients( + self._test_variable_initialization, + self._cluster_spec, + num_gpus=num_gpus) + + @combinations.generate( + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1)) + def testComplexModel(self, num_gpus): + if context.num_gpus() < num_gpus: + self.skipTest('Not enough GPUs') + self._run_between_graph_clients( + self._test_complex_model, self._cluster_spec, num_gpus=num_gpus) + + # TODO(yuefengz): Update how we use num_gpus and required_gpus + @combinations.generate( + combinations.combine( + mode=['graph'], + num_gpus=[0, 1, 2], + required_gpus=1, + use_dataset=[True, False])) + def testMakeInputFnIterator(self, num_gpus, use_dataset): + if context.num_gpus() < num_gpus: + self.skipTest('Not enough GPUs') + if use_dataset: + fn = lambda: dataset_ops.Dataset.range(100) + else: + def fn(): + dataset = dataset_ops.Dataset.range(100) + it = dataset.make_one_shot_iterator() + return it.get_next + # We use CPU as the device when num_gpus = 0 + devices_per_worker = max(1, num_gpus) + expected_values = [[i+j for j in range(devices_per_worker)] + for i in range(0, 100, devices_per_worker)] + + input_fn = self._input_fn_to_test_input_context( + fn, + expected_num_replicas_in_sync=3*devices_per_worker, + expected_num_input_pipelines=3, + 
expected_input_pipeline_id=1) # because task_id = 1 + self._test_input_fn_iterator( + 'worker', + 1, + num_gpus, + input_fn, + expected_values, + test_reinitialize=use_dataset, + ignore_order=not use_dataset) + + @combinations.generate(combinations.combine(mode=['graph'])) + def testUpdateConfigProto(self): + strategy, _, _ = self._get_test_object( + task_type='worker', task_id=1, num_gpus=2) + + config_proto = config_pb2.ConfigProto(device_filters=['to_be_overridden']) + rewrite_options = config_proto.graph_options.rewrite_options + rewrite_options.scoped_allocator_opts.enable_op.append('to_be_removed') + + new_config = strategy.update_config_proto(config_proto) + + # Verify group leader + self.assertEqual('/job:worker/replica:0/task:0', + new_config.experimental.collective_group_leader) + + # Verify device filters. + self.assertEqual(['/job:worker/task:1'], new_config.device_filters) + + # Verify rewrite options. + new_rewrite_options = new_config.graph_options.rewrite_options + self.assertEqual(rewriter_config_pb2.RewriterConfig.ON, + new_rewrite_options.scoped_allocator_optimization) + self.assertEqual(['CollectiveReduce'], + new_rewrite_options.scoped_allocator_opts.enable_op) + + @combinations.generate(combinations.combine(mode=['eager'])) + def testEnableCollectiveOps(self): + mock_called = [False] + + # pylint: disable=dangerous-default-value + def mock_enable_collective_ops(server_def, mock_called=mock_called): + self.assertEqual('worker', server_def.job_name) + self.assertEqual(1, server_def.task_index) + self.assertEqual('grpc', server_def.protocol) + mock_called[0] = True + + def mock_configure_collective_ops(*args, **kwargs): + del args, kwargs + + with test.mock.patch.object(context.context(), 'enable_collective_ops', + mock_enable_collective_ops), \ + test.mock.patch.object(context.context(), 'configure_collective_ops', + mock_configure_collective_ops): + strategy, _, _ = self._get_test_object( + task_type='worker', task_id=1, num_gpus=2) + self.assertTrue(strategy.extended._std_server_started) + self.assertTrue(mock_called[0]) + + +class DistributedCollectiveAllReduceStrategyTestWithChief( + CollectiveAllReduceStrategyTestBase, parameterized.TestCase): + + @classmethod + def setUpClass(cls): + """Create a local cluster with 3 workers and 1 chief.""" + cls._cluster_spec = multi_worker_test_base.create_in_process_cluster( + num_workers=3, num_ps=0, has_chief=True) + + @combinations.generate( + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1)) + def testMinimizeLossGraph(self, num_gpus): + self._run_between_graph_clients(self._test_minimize_loss_graph, + self._cluster_spec, num_gpus) + + @combinations.generate( + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1)) + def testVariableInitialization(self, num_gpus): + if context.num_gpus() < num_gpus: + return + self._run_between_graph_clients( + self._test_variable_initialization, + self._cluster_spec, + num_gpus=num_gpus) + + @combinations.generate( + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1)) + def testComplexModel(self, num_gpus): + if context.num_gpus() < num_gpus: + return + self._run_between_graph_clients( + self._test_complex_model, self._cluster_spec, num_gpus=num_gpus) + + +class LocalCollectiveAllReduceStrategy( + CollectiveAllReduceStrategyTestBase, + strategy_test_lib.DistributionTestBase, + strategy_test_lib.TwoDeviceDistributionTestBase, + parameterized.TestCase): + + @combinations.generate( + combinations.combine( + mode=['graph', 
'eager'], num_gpus=[2, 4], required_gpus=2)) + def testMinimizeLoss(self, num_gpus): + # Collective ops doesn't support strategy with one device. + if context.num_gpus() < num_gpus: + self.skipTest('Not enough GPUs') + if context.executing_eagerly(): + strategy, _, _ = self._get_test_object(None, None, num_gpus) + self._test_minimize_loss_eager(strategy) + else: + self._test_minimize_loss_graph(None, None, num_gpus) + + @combinations.generate( + combinations.combine(mode=['graph'], num_gpus=[2, 4], required_gpus=2)) + def testComplexModel(self, num_gpus): + if context.num_gpus() < num_gpus: + self.skipTest('Not enough GPUs') + self._test_complex_model(None, None, num_gpus) + + @combinations.generate( + combinations.combine( + mode=['graph', 'eager'], required_gpus=2, use_dataset=[True, False])) + def testMakeInputFnIterator(self, use_dataset): + num_gpus = 2 + if use_dataset: + fn = lambda: dataset_ops.Dataset.range(5 * num_gpus) + else: + def fn(): + dataset = dataset_ops.Dataset.range(5 * num_gpus) + it = dataset.make_one_shot_iterator() + return it.get_next + expected_values = [range(i, i + num_gpus) for i in range(0, 10, num_gpus)] + + input_fn = self._input_fn_to_test_input_context( + fn, + expected_num_replicas_in_sync=num_gpus, + expected_num_input_pipelines=1, + expected_input_pipeline_id=0) + self._test_input_fn_iterator( + None, + None, + num_gpus, + input_fn, + expected_values, + test_reinitialize=use_dataset, + ignore_order=not use_dataset) + + @combinations.generate(combinations.combine(mode=['graph'])) + def testAllReduceSum(self): + if context.num_gpus() < 2: self.skipTest('Not enough GPUs') + distribution, target, config = self._get_test_object(None, None, num_gpus=2) + with self.cached_session(config=config, target=target): + self._test_all_reduce_sum(distribution) + + @combinations.generate(combinations.combine(mode=['graph'])) + def testAllReduceSumGradients(self): + if context.num_gpus() < 2: self.skipTest('Not enough GPUs') + distribution, target, config = self._get_test_object(None, None, num_gpus=2) + with self.cached_session(config=config, target=target): + self._test_all_reduce_sum_gradients(distribution) + + @combinations.generate(combinations.combine(mode=['graph'])) + def testAllReduceSumGradientTape(self): + if context.num_gpus() < 2: self.skipTest('Not enough GPUs') + distribution, target, config = self._get_test_object(None, None, num_gpus=2) + with self.cached_session(config=config, target=target): + self._test_all_reduce_sum_gradient_tape(distribution) + + @combinations.generate(combinations.combine(mode=['graph'])) + def testAllReduceMean(self): + if context.num_gpus() < 2: self.skipTest('Not enough GPUs') + distribution, target, config = self._get_test_object(None, None, num_gpus=2) + with self.cached_session(config=config, target=target): + self._test_all_reduce_mean(distribution) + + @combinations.generate(combinations.combine(mode=['graph'])) + def testAllReduceMeanGradients(self): + if context.num_gpus() < 2: self.skipTest('Not enough GPUs') + distribution, target, config = self._get_test_object(None, None, num_gpus=2) + with self.cached_session(config=config, target=target): + self._test_all_reduce_mean_gradients(distribution) + + @combinations.generate(combinations.combine(mode=['graph'])) + def testAllReduceMeanGradientTape(self): + if context.num_gpus() < 2: self.skipTest('Not enough GPUs') + distribution, target, config = self._get_test_object(None, None, num_gpus=2) + with self.cached_session(config=config, target=target): + 
self._test_all_reduce_mean_gradient_tape(distribution) + + @combinations.generate(combinations.combine(mode=['graph'])) + def testNumpyIterator(self): + num_gpus = 2 + if context.num_gpus() < num_gpus: + self.skipTest('Not enough GPUs') + strategy, _, _ = self._get_test_object(None, None, num_gpus=num_gpus) + self._test_numpy_iterator(strategy) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/python/distribute/parameter_server_strategy_test.py b/tensorflow/python/distribute/parameter_server_strategy_test.py new file mode 100644 index 00000000000..f8202fd050b --- /dev/null +++ b/tensorflow/python/distribute/parameter_server_strategy_test.py @@ -0,0 +1,817 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for ParameterServerStrategy.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy +import threading +from absl.testing import parameterized +from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.distribute import central_storage_strategy +from tensorflow.python.distribute import combinations +from tensorflow.python.distribute import device_util +from tensorflow.python.distribute import distribution_strategy_context as ds_context +from tensorflow.python.distribute import multi_worker_test_base +from tensorflow.python.distribute import multi_worker_util +from tensorflow.python.distribute import parameter_server_strategy +from tensorflow.python.distribute import reduce_util +from tensorflow.python.distribute import strategy_test_lib +from tensorflow.python.distribute import values +from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver +from tensorflow.python.eager import backprop +from tensorflow.python.eager import context +from tensorflow.python.estimator import run_config +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import errors +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_util +from tensorflow.python.layers import core +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import gradients +from tensorflow.python.ops import partitioned_variables +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import variable_scope +from tensorflow.python.ops import variables +from tensorflow.python.platform import test +from tensorflow.python.training import training_util + +CHIEF = run_config.TaskType.CHIEF +WORKER = run_config.TaskType.WORKER +PS = run_config.TaskType.PS + + +def _get_replica_id_integer(): + replica_id = ds_context.get_replica_context().replica_id_in_sync_group + if isinstance(replica_id, ops.Tensor): + replica_id = 
tensor_util.constant_value(replica_id) + return replica_id + + +def create_test_objects(cluster_spec=None, + task_type=None, + task_id=None, + num_gpus=None, + sess_config=None): + sess_config = sess_config or config_pb2.ConfigProto() + if num_gpus is None: + num_gpus = context.num_gpus() + if cluster_spec and task_type and task_id is not None: + cluster_resolver = SimpleClusterResolver( + cluster_spec=multi_worker_util.normalize_cluster_spec(cluster_spec), + task_type=task_type, + task_id=task_id, + num_accelerators={'GPU': num_gpus}) + distribution = parameter_server_strategy.ParameterServerStrategy( + cluster_resolver) + target = 'grpc://' + cluster_spec[WORKER][task_id] + else: + distribution = ( + central_storage_strategy.CentralStorageStrategy._from_num_gpus(num_gpus) + ) + target = '' + + sess_config = copy.deepcopy(sess_config) + sess_config = distribution.update_config_proto(sess_config) + + return distribution, target, sess_config + + +class ParameterServerStrategyTestBase( + multi_worker_test_base.MultiWorkerTestBase): + + def setUp(self): + self._result = 0 + self._lock = threading.Lock() + self._init_condition = threading.Condition() + self._init_reached = 0 + self._finish_condition = threading.Condition() + self._finish_reached = 0 + self._sess_config = config_pb2.ConfigProto(allow_soft_placement=True) + super(ParameterServerStrategyTestBase, self).setUp() + + def _get_test_objects(self, task_type, task_id, num_gpus): + return create_test_objects( + cluster_spec=self._cluster_spec, + task_type=task_type, + task_id=task_id, + num_gpus=num_gpus, + sess_config=self._sess_config) + + def _test_device_assignment_distributed(self, task_type, task_id, num_gpus): + worker_device = '/job:%s/replica:0/task:%d' % (task_type, task_id) + d, _, sess_config = self._get_test_objects(task_type, task_id, num_gpus) + with ops.Graph().as_default(), \ + self.cached_session(target=self._default_target, + config=sess_config) as sess, \ + d.scope(): + + # Define a variable outside the call_for_each_replica scope. + n = variable_scope.get_variable('n', initializer=10.0) + self.assertEqual(n.device, '/job:ps/task:0') + + def model_fn(): + if num_gpus == 0: + last_part_device = 'device:CPU:0' + else: + replica_id = _get_replica_id_integer() + last_part_device = ('device:GPU:%d' % replica_id) + + a = constant_op.constant(1.0) + b = constant_op.constant(2.0) + c = a + b + self.assertEqual(a.device, worker_device + '/' + last_part_device) + self.assertEqual(b.device, worker_device + '/' + last_part_device) + self.assertEqual(c.device, worker_device + '/' + last_part_device) + + # The device scope is ignored for variables but not for normal ops. + with ops.device('/job:worker/task:0'): + x = variable_scope.get_variable( + 'x', initializer=10.0, + aggregation=variable_scope.VariableAggregation.SUM) + x_add = x.assign_add(c) + e = a + c + # The variable x is on the task 1 since the device_function has been + # called once before the model_fn. + self.assertEqual(x.device, '/job:ps/task:1') + self.assertEqual(x_add.device, x.device) + self.assertEqual(e.device, + '/job:worker/replica:0/task:0/%s' % last_part_device) + + # The colocate_vars_with can override the distribution's device. + with d.extended.colocate_vars_with(x): + y = variable_scope.get_variable( + 'y', initializer=20.0, + aggregation=variable_scope.VariableAggregation.SUM) + # We add an identity here to avoid complaints about summing + # non-distributed values. 
+ y_add = y.assign_add(array_ops.identity(x_add)) + self.assertEqual(y.device, '/job:ps/task:1') + self.assertEqual(y_add.device, y.device) + self.assertEqual(y.device, x.device) + + z = variable_scope.get_variable( + 'z', initializer=10.0, + aggregation=variable_scope.VariableAggregation.SUM) + self.assertEqual(z.device, '/job:ps/task:0') + self.assertNotEqual(z.device, x.device) + + with ops.control_dependencies([y_add]): + # We add an identity here to avoid complaints about summing + # non-distributed values. + z_add = z.assign_add(array_ops.identity(y)) + with ops.control_dependencies([z_add]): + f = z + c + self.assertEqual(f.device, worker_device + '/' + last_part_device) + + # The device scope would merge with the default worker device. + with ops.device('/CPU:1'): + g = e + 1.0 + self.assertEqual(g.device, worker_device + '/device:CPU:1') + + # Ths ops.colocate_with will be ignored when defining a variale but not + # for a normal tensor. + with ops.colocate_with(x): + u = variable_scope.get_variable('u', initializer=30.0) + v = variable_scope.get_variable('v', initializer=30.0) + h = f + 1.0 + self.assertIn('/job:ps/', u.device) + self.assertIn('/job:ps/', v.device) + # u and v are on different parameter servers. + self.assertTrue(u.device != x.device or v.device != x.device) + self.assertTrue(u.device == x.device or v.device == x.device) + # Here h is not on one worker. Note h.device is canonical while x.device + # is not but. + self.assertIn('/job:ps/', h.device) + return y_add, z_add, f + + y, z, f = d.extended.call_for_each_replica(model_fn) + self.assertNotEqual(y, None) + self.assertNotEqual(z, None) + self.assertNotEqual(f, None) + + if context.num_gpus() >= 1 and num_gpus <= 1: + variables.global_variables_initializer().run() + y_val, z_val, f_val = sess.run([y, z, f]) + self.assertEqual(y_val, 33.0) + self.assertEqual(z_val, 43.0) + self.assertEqual(f_val, 46.0) + + def _test_device_assignment_distributed_enable_partitioner( + self, task_type, task_id, num_gpus): + d, _, sess_config = self._get_test_objects(task_type, task_id, num_gpus) + num_shards = len(d.extended.parameter_devices) + partitioner = partitioned_variables.fixed_size_partitioner(num_shards) + with ops.Graph().as_default(), \ + self.cached_session(target=self._default_target, + config=sess_config) as sess, \ + d.scope(): + + n = variable_scope.get_variable( + 'n', + initializer=constant_op.constant([10.0, 20.0]), + aggregation=variable_scope.VariableAggregation.SUM, + partitioner=partitioner) + + for part_id, var in enumerate(n): + self.assertEqual(var.device, '/job:ps/task:%d' % part_id) + + def model_fn(): + a = constant_op.constant([3.0, 5.0]) + # The device scope is ignored for variables but not for normal ops. + with ops.device('/job:worker/task:0'): + x = variable_scope.get_variable( + 'x', + initializer=constant_op.constant([10.0, 20.0]), + aggregation=variable_scope.VariableAggregation.SUM, + partitioner=partitioner) + x_add = x.assign_add(a, name='x_add') + # The variable x is on the task 1 since the device_function has been + # called once before the model_fn. 
+ for part_id, var in enumerate(x): + self.assertEqual(var.device, '/job:ps/task:%d' % part_id) + self.assertEqual(var.device, x_add[part_id].device) + + return x_add + + x = d.extended.call_for_each_replica(model_fn) + + if context.num_gpus() >= 1: + variables.global_variables_initializer().run() + x_val = sess.run(x) + if num_gpus < 1: + self.assertEqual(x_val, [13.0, 25.0]) + else: + x_expect = [10.0 + 3 * num_gpus, 20.0 + 5 * num_gpus] + self.assertEqual(x_val, x_expect) + + def _test_device_assignment_local(self, + d, + compute_device='CPU', + variable_device='CPU', + num_gpus=0): + with ops.Graph().as_default(), \ + self.cached_session(target=self._default_target, + config=self._sess_config) as sess, \ + d.scope(): + + def model_fn(): + if 'CPU' in compute_device: + replica_compute_device = '/device:CPU:0' + else: + replica_id = _get_replica_id_integer() + replica_compute_device = ('/device:GPU:%d' % replica_id) + replica_compute_device = device_util.canonicalize( + replica_compute_device) + + if 'CPU' in variable_device: + replica_variable_device = '/device:CPU:0' + else: + replica_id = _get_replica_id_integer() + replica_variable_device = ('/device:GPU:%d' % replica_id) + replica_variable_device = device_util.canonicalize( + replica_variable_device) + + a = constant_op.constant(1.0) + b = constant_op.constant(2.0) + c = a + b + self.assertEqual(a.device, replica_compute_device) + self.assertEqual(b.device, replica_compute_device) + self.assertEqual(c.device, replica_compute_device) + + # The device scope is ignored for variables but not for normal ops. + with ops.device('/device:GPU:2'): + x = variable_scope.get_variable( + 'x', initializer=10.0, + aggregation=variable_scope.VariableAggregation.SUM) + x_add = x.assign_add(c) + e = a + c + self.assertEqual( + device_util.canonicalize(x.device), replica_variable_device) + self.assertEqual(x_add.device, x.device) + self.assertEqual(e.device, device_util.canonicalize('/device:GPU:2')) + + # The colocate_vars_with can override the distribution's device. + with d.extended.colocate_vars_with(x): + y = variable_scope.get_variable( + 'y', initializer=20.0, + aggregation=variable_scope.VariableAggregation.SUM) + # We add an identity here to avoid complaints about summing + # non-distributed values. + y_add = y.assign_add(array_ops.identity(x_add)) + self.assertEqual( + device_util.canonicalize(y.device), replica_variable_device) + self.assertEqual(y_add.device, y.device) + self.assertEqual(y.device, x.device) + + z = variable_scope.get_variable( + 'z', initializer=10.0, + aggregation=variable_scope.VariableAggregation.SUM) + self.assertEqual( + device_util.canonicalize(z.device), replica_variable_device) + + with ops.control_dependencies([y_add]): + # We add an identity here to avoid complaints about summing + # non-distributed values. + z_add = z.assign_add(array_ops.identity(y)) + with ops.control_dependencies([z_add]): + f = z + c + self.assertEqual(f.device, replica_compute_device) + + # The device scope would merge with the default worker device. + with ops.device('/CPU:1'): + g = e + 1.0 + self.assertEqual(g.device, device_util.canonicalize('/device:CPU:1')) + + # Ths ops.colocate_with will be ignored when defining a variale but not + # for a normal tensor. 
+ with ops.colocate_with(x): + u = variable_scope.get_variable('u', initializer=30.0) + h = f + 1.0 + self.assertEqual( + device_util.canonicalize(u.device), replica_variable_device) + self.assertEqual( + device_util.canonicalize(x.device), + device_util.canonicalize(h.device)) + return y_add, z_add, f + + y, z, f = d.extended.call_for_each_replica(model_fn) + self.assertNotEqual(y, None) + self.assertNotEqual(z, None) + self.assertNotEqual(f, None) + + if context.num_gpus() >= 1 and num_gpus <= 1: + variables.global_variables_initializer().run() + y_val, z_val, f_val = sess.run([y, z, f]) + self.assertEqual(y_val, 33.0) + self.assertEqual(z_val, 43.0) + self.assertEqual(f_val, 46.0) + + def _test_simple_increment(self, task_type, task_id, num_gpus): + d, master_target, sess_config = self._get_test_objects( + task_type, task_id, num_gpus) + if d.extended._cluster_spec: + num_workers = len(d.extended._cluster_spec.as_dict().get(WORKER)) + if 'chief' in d.extended._cluster_spec.as_dict(): + num_workers += 1 + else: + num_workers = 1 + with ops.Graph().as_default(), \ + self.cached_session(target=master_target, + config=sess_config) as sess, \ + d.scope(): + + def model_fn(): + x = variable_scope.get_variable( + 'x', initializer=10.0, + aggregation=variable_scope.VariableAggregation.SUM) + y = variable_scope.get_variable( + 'y', initializer=20.0, + aggregation=variable_scope.VariableAggregation.SUM) + z = variable_scope.get_variable( + 'z', initializer=30.0, + aggregation=variable_scope.VariableAggregation.ONLY_FIRST_REPLICA) + + # We explicitly make a constant tensor here to avoid complaints about + # summing non-distributed values. + one = constant_op.constant(1.0) + x_add = x.assign_add(one, use_locking=True) + y_add = y.assign_add(one, use_locking=True) + z_add = z.assign_add(one, use_locking=True) + + train_op = control_flow_ops.group(x_add, y_add, z_add) + return x, y, z, train_op + + x, y, z, train_op = d.extended.call_for_each_replica(model_fn) + train_op = d.group(train_op) + + if context.num_gpus() < sum( + 1 for d in d.extended.worker_devices if 'GPU' in d.upper()): + return True + + if task_id == 0: + variables.global_variables_initializer().run() + + # Workers waiting for chief worker's initializing variables. + self._init_condition.acquire() + self._init_reached += 1 + while self._init_reached != num_workers: + self._init_condition.wait() + self._init_condition.notify_all() + self._init_condition.release() + + sess.run(train_op) + + # Wait for other workers to finish training. 
+ self._finish_condition.acquire() + self._finish_reached += 1 + while self._finish_reached != num_workers: + self._finish_condition.wait() + self._finish_condition.notify_all() + self._finish_condition.release() + + x_val, y_val, z_val = sess.run([x, y, z]) + self.assertEqual(x_val, 10.0 + 1.0 * num_workers * d.num_replicas_in_sync) + self.assertEqual(y_val, 20.0 + 1.0 * num_workers * d.num_replicas_in_sync) + self.assertEqual(z_val, 30.0 + 1.0 * num_workers) + + def _test_minimize_loss_graph(self, task_type, task_id, num_gpus): + d, master_target, sess_config = self._get_test_objects( + task_type, task_id, num_gpus) + if task_type: + # Multi-worker + assert hasattr(d.extended, '_cluster_spec') and d.extended._cluster_spec + num_workers = len(d.extended._cluster_spec.as_dict().get(WORKER)) + if CHIEF in d.extended._cluster_spec.as_dict(): + num_workers += 1 + else: + # local + num_workers = 1 + + with ops.Graph().as_default(), \ + self.cached_session(target=master_target, + config=sess_config) as sess, \ + d.scope(): + l = core.Dense(1, use_bias=False) + + def loss_fn(x): + y = array_ops.reshape(l(x), []) - constant_op.constant(1.) + return y * y + + # TODO(yuefengz, apassos): eager.backprop.implicit_grad is not safe for + # multiple graphs (b/111216820). + def grad_fn(x): + loss = loss_fn(x) + var_list = ( + variables.trainable_variables() + ops.get_collection( + ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)) + grads = gradients.gradients(loss, var_list) + ret = list(zip(grads, var_list)) + return ret + + def update(v, g): + return v.assign_sub(0.05 * g, use_locking=True) + + one = constant_op.constant([[1.]]) + + def step(): + """Perform one optimization step.""" + # Run forward & backward to get gradients, variables list. + g_v = d.extended.call_for_each_replica(grad_fn, args=(one,)) + # Update the variables using the gradients and the update() function. + before_list = [] + after_list = [] + for g, v in g_v: + fetched = d.extended.read_var(v) + before_list.append(fetched) + with ops.control_dependencies([fetched]): + # TODO(yuefengz): support non-Mirrored variable as destinations. + g = d.extended.reduce_to( + reduce_util.ReduceOp.SUM, g, destinations=v) + with ops.control_dependencies( + d.extended.update(v, update, args=(g,), group=False)): + after_list.append(d.extended.read_var(v)) + return before_list, after_list + + before_out, after_out = step() + + if context.num_gpus() < sum( + 1 for d in d.extended.worker_devices if 'GPU' in d.upper()): + return True + + if (not task_type or + multi_worker_util.is_chief( + d.extended._cluster_spec, task_type, task_id)): + variables.global_variables_initializer().run() + + # Workers waiting for chief worker's initializing variables. 
+ self._init_condition.acquire() + self._init_reached += 1 + while self._init_reached != num_workers: + self._init_condition.wait() + self._init_condition.notify_all() + self._init_condition.release() + + for i in range(10): + b, a = sess.run((before_out, after_out)) + if i == 0: + before, = b + after, = a + + error_before = abs(before - 1) + error_after = abs(after - 1) + # Error should go down + self.assertLess(error_after, error_before) + + def _test_input_fn_iterator(self, + task_type, + task_id, + num_gpus, + input_fn, + expected_values, + test_reinitialize=True, + ignore_order=False): + distribution, master_target, config = self._get_test_objects( + task_type, task_id, num_gpus) + devices = distribution.extended.worker_devices + + with ops.Graph().as_default(), \ + self.cached_session(config=config, + target=master_target) as sess: + iterator = distribution.make_input_fn_iterator(input_fn) + sess.run(iterator.initialize()) + + for expected_value in expected_values: + next_element = iterator.get_next() + computed_value = sess.run([values.select_replica(r, next_element) + for r in range(len(devices))]) + if ignore_order: + self.assertCountEqual(expected_value, computed_value) + else: + self.assertEqual(expected_value, computed_value) + + with self.assertRaises(errors.OutOfRangeError): + next_element = iterator.get_next() + sess.run([values.select_replica(r, next_element) + for r in range(len(devices))]) + + # After re-initializing the iterator, should be able to iterate again. + if test_reinitialize: + sess.run(iterator.initialize()) + + for expected_value in expected_values: + next_element = iterator.get_next() + computed_value = sess.run([values.select_replica(r, next_element) + for r in range(len(devices))]) + if ignore_order: + self.assertCountEqual(expected_value, computed_value) + else: + self.assertEqual(expected_value, computed_value) + + +class ParameterServerStrategyTest( + ParameterServerStrategyTestBase, + strategy_test_lib.DistributionTestBase, + strategy_test_lib.TwoDeviceDistributionTestBase, + parameterized.TestCase): + + @classmethod + def setUpClass(cls): + cls._cluster_spec = multi_worker_test_base.create_in_process_cluster( + num_workers=3, num_ps=2) + cls._default_target = 'grpc://' + cls._cluster_spec[WORKER][0] + + @combinations.generate(combinations.combine(mode=['graph'])) + def test_num_replicas_in_sync(self): + strategy, _, _ = create_test_objects(num_gpus=2) + # All the devices on a given worker are in sync which in this case is the + # number of gpus on each worker. 
+ self.assertEqual(2, strategy.num_replicas_in_sync) + + @combinations.generate(combinations.combine(mode=['graph'])) + def testDeviceAssignmentLocalCPU(self): + strategy, _, _ = create_test_objects(num_gpus=0) + self._test_device_assignment_local( + strategy, compute_device='CPU', variable_device='CPU', num_gpus=0) + + @combinations.generate(combinations.combine(mode=['graph'])) + def testDeviceAssignmentLocalOneGPU(self): + strategy, _, _ = create_test_objects(num_gpus=1) + self._test_device_assignment_local( + strategy, compute_device='GPU', variable_device='GPU', num_gpus=1) + + @combinations.generate(combinations.combine(mode=['graph'])) + def testDeviceAssignmentLocalTwoGPUs(self): + strategy, _, _ = create_test_objects(num_gpus=2) + self._test_device_assignment_local( + strategy, compute_device='GPU', variable_device='CPU', num_gpus=2) + + @combinations.generate( + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2])) + def testDeviceAssignmentDistributed(self, num_gpus): + self._test_device_assignment_distributed('worker', 1, num_gpus) + + @combinations.generate( + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2])) + def testDeviceAssignmentDistributedEnablePartitioner(self, num_gpus): + self._test_device_assignment_distributed_enable_partitioner( + 'worker', 1, num_gpus) + + @combinations.generate(combinations.combine(mode=['graph'])) + def testSimpleBetweenGraph(self): + self._run_between_graph_clients(self._test_simple_increment, + self._cluster_spec, context.num_gpus()) + + @combinations.generate( + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2])) + def testLocalSimpleIncrement(self, num_gpus): + self._test_simple_increment(None, 0, num_gpus) + + @combinations.generate( + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2])) + def testMinimizeLossGraphDistributed(self, num_gpus): + self._run_between_graph_clients(self._test_minimize_loss_graph, + self._cluster_spec, num_gpus) + + @combinations.generate( + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2])) + def testMinimizeLossGraphLocal(self, num_gpus): + self._test_minimize_loss_graph(None, None, num_gpus) + + # TODO(priyag): Refactor this and other multi worker tests. 
+ @combinations.generate( + combinations.combine( + mode=['graph'], + num_gpus=[1, 2], + required_gpus=1, + use_dataset=[True, False])) + def testMakeInputFnIteratorDistributed(self, num_gpus, use_dataset): + if context.num_gpus() < num_gpus: + self.skipTest('Not enough GPUs') + if use_dataset: + fn = lambda: dataset_ops.Dataset.range(100) + else: + def fn(): + dataset = dataset_ops.Dataset.range(100) + it = dataset.make_one_shot_iterator() + return it.get_next + expected_values = [[i+j for j in range(num_gpus)] + for i in range(0, 100, num_gpus)] + + input_fn = self._input_fn_to_test_input_context( + fn, + expected_num_replicas_in_sync=num_gpus, + expected_num_input_pipelines=3, + expected_input_pipeline_id=1) # because task_id = 1 + self._test_input_fn_iterator( + 'worker', + 1, + num_gpus, + input_fn, + expected_values, + test_reinitialize=use_dataset, + ignore_order=not use_dataset) + + @combinations.generate( + combinations.combine( + mode=['graph'], + num_gpus=[1, 2], + required_gpus=1, + use_dataset=[True, False])) + def testMakeInputFnIteratorLocal(self, num_gpus, use_dataset): + if context.num_gpus() < num_gpus: + self.skipTest('Not enough GPUs') + if use_dataset: + fn = lambda: dataset_ops.Dataset.range(100) + else: + def fn(): + dataset = dataset_ops.Dataset.range(100) + it = dataset.make_one_shot_iterator() + return it.get_next + expected_values = [[i+j for j in range(num_gpus)] + for i in range(0, 100, num_gpus)] + + input_fn = self._input_fn_to_test_input_context( + fn, + expected_num_replicas_in_sync=num_gpus, + expected_num_input_pipelines=1, + expected_input_pipeline_id=0) # only one worker and pipeline for local. + self._test_input_fn_iterator( + None, + None, + num_gpus, + input_fn, + expected_values, + test_reinitialize=use_dataset, + ignore_order=not use_dataset) + + @combinations.generate(combinations.combine(mode=['graph'])) + def testGlobalStepUpdate(self): + strategy, _, _ = create_test_objects() + self._test_global_step_update(strategy) + + @combinations.generate(combinations.combine(mode=['graph'])) + def testUpdateConfigProtoMultiWorker(self): + strategy, _, _ = create_test_objects( + cluster_spec=self._cluster_spec, + task_type='worker', + task_id=1, + num_gpus=2) + + config_proto = config_pb2.ConfigProto(device_filters=['to_be_overridden']) + + new_config = strategy.update_config_proto(config_proto) + + # Verify device filters. 
+ self.assertEqual(['/job:worker/task:1', '/job:ps'], + new_config.device_filters) + + # Verify isolate_session_state + self.assertFalse(new_config.isolate_session_state) + + @combinations.generate(combinations.combine(mode=['graph'])) + def testUpdateConfigProtoLocal(self): + strategy, _, _ = create_test_objects(num_gpus=2) + + config_proto = config_pb2.ConfigProto() + new_config = strategy.update_config_proto(config_proto) + + # Verify isolate_session_state + self.assertTrue(new_config.isolate_session_state) + + +class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase, + parameterized.TestCase): + + @classmethod + def setUpClass(cls): + cls._cluster_spec = multi_worker_test_base.create_in_process_cluster( + num_workers=3, num_ps=2, has_chief=True) + cls._default_target = 'grpc://' + cls._cluster_spec[CHIEF][0] + + @combinations.generate(combinations.combine(mode=['graph'])) + def testSimpleBetweenGraph(self): + self._run_between_graph_clients(self._test_simple_increment, + self._cluster_spec, context.num_gpus()) + + @combinations.generate( + combinations.combine(mode=['graph'], num_gpus=[0, 1, 2])) + def testMinimizeLossGraph(self, num_gpus): + self._run_between_graph_clients(self._test_minimize_loss_graph, + self._cluster_spec, num_gpus) + + @combinations.generate(combinations.combine(mode=['graph'])) + def testGlobalStepIsWrappedOnTwoGPUs(self): + strategy, _, _ = create_test_objects(num_gpus=2) + with ops.Graph().as_default(), strategy.scope(): + created_step = training_util.create_global_step() + get_step = training_util.get_global_step() + self.assertEqual(created_step, get_step, + msg=('created_step %s type %s vs. get_step %s type %s' % + (id(created_step), created_step.__class__.__name__, + id(get_step), get_step.__class__.__name__))) + self.assertIs(values.AggregatingVariable, type(created_step)) + self.assertIs(values.AggregatingVariable, type(get_step)) + self.assertIs(strategy, created_step.distribute_strategy) + + @combinations.generate(combinations.combine(mode=['graph'])) + def testGlobalStepIsNotWrappedOnOneGPU(self): + strategy, _, _ = create_test_objects(num_gpus=1) + with ops.Graph().as_default(), strategy.scope(): + created_step = training_util.create_global_step() + get_step = training_util.get_global_step() + self.assertEqual(created_step, get_step, + msg=('created_step %s type %s vs. get_step %s type %s' % + (id(created_step), created_step.__class__.__name__, + id(get_step), get_step.__class__.__name__))) + self.assertIs(resource_variable_ops.ResourceVariable, type(created_step)) + self.assertIs(resource_variable_ops.ResourceVariable, type(get_step)) + # All variables have an _distribute_strategy parameter. Only variable + # subclasses in distribution strategy expose it publicly. 
+ self.assertFalse(hasattr(strategy, 'distribute_strategy')) + self.assertIs(strategy, created_step._distribute_strategy) + + @combinations.generate(combinations.combine(mode=['graph'])) + def testValueContainer(self): + strategy, _, _ = create_test_objects(num_gpus=2) + with ops.Graph().as_default(), strategy.scope(): + + def f(): + with backprop.GradientTape() as tape: + v = variable_scope.get_variable('v', initializer=10.0) + _ = v * v + v, = tape.watched_variables() + w = strategy.extended.value_container(v) + self.assertIs(values.AggregatingVariable, type(w)) + + strategy.extended.call_for_each_replica(f) + + +class CentralStorageStrategyTest(strategy_test_lib.DistributionTestBase, + parameterized.TestCase): + + @combinations.generate(combinations.combine(mode=['graph', 'eager'], + required_gpus=2)) + def testNumpyDataset(self): + strategy, _, _ = create_test_objects(num_gpus=2) + self._test_numpy_dataset(strategy) + + +if __name__ == '__main__': + test.main() From b71bdb8980b3050bf7a147f060dcc23b352c6d7b Mon Sep 17 00:00:00 2001 From: Bruce Fontaine Date: Tue, 23 Jul 2019 09:42:40 -0700 Subject: [PATCH 0385/3053] Fix test for mod partitioning of embedding tables on CPU in TPUEstimator. Fix bug with sequence columns in a shared embedding. PiperOrigin-RevId: 259553876 --- tensorflow/python/tpu/feature_column_v2.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/tpu/feature_column_v2.py b/tensorflow/python/tpu/feature_column_v2.py index afc7e6173f9..8a5535591e4 100644 --- a/tensorflow/python/tpu/feature_column_v2.py +++ b/tensorflow/python/tpu/feature_column_v2.py @@ -534,8 +534,8 @@ class _TPUSharedEmbeddingColumnV2(_TPUBaseEmbeddingColumn, return fc_lib.SharedEmbeddingColumn.get_sequence_dense_tensor( self, transformation_cache, state_manager) - tensor = fc_lib.SharedEmbeddingColumn._dense_tensor_internal( - self, transformation_cache, state_manager) + tensor = self._get_dense_tensor_internal( + transformation_cache, state_manager) tensor_lengths = transformation_cache.get( self.get_sequence_length_feature_key_name(), state_manager) From af07a124fa8844e3094cafdd922840ee512ff0f1 Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Tue, 23 Jul 2019 09:55:12 -0700 Subject: [PATCH 0386/3053] Updating URLs and checksums of some downloads. 
PiperOrigin-RevId: 259556121 --- .../tutorials/word2vec/word2vec_basic.py | 22 +++++++++++++++++-- tensorflow/python/keras/datasets/cifar10.py | 7 +++++- tensorflow/python/keras/datasets/cifar100.py | 7 +++++- tensorflow/python/keras/datasets/imdb.py | 3 ++- tensorflow/python/keras/datasets/mnist.py | 3 ++- tensorflow/python/keras/datasets/reuters.py | 3 ++- 6 files changed, 38 insertions(+), 7 deletions(-) diff --git a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py index 380cd2be515..d48e7689fa8 100644 --- a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py +++ b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py @@ -20,6 +20,7 @@ from __future__ import print_function import argparse import collections +import hashlib import math import os import random @@ -37,6 +38,14 @@ from tensorflow.contrib.tensorboard.plugins import projector data_index = 0 +def _hash_file(fpath): + hasher = hashlib.sha256() + with open(fpath, 'rb') as fpath_file: + for chunk in iter(lambda: fpath_file.read(65535), b''): + hasher.update(chunk) + return hasher.hexdigest() + + def word2vec_basic(log_dir): """Example of building, training and visualizing a word2vec model.""" # Create the directory for TensorBoard variables if there is not. @@ -44,16 +53,22 @@ def word2vec_basic(log_dir): os.makedirs(log_dir) # Step 1: Download the data. + # Note: Source website does not support HTTPS right now. url = 'http://mattmahoney.net/dc/' # pylint: disable=redefined-outer-name - def maybe_download(filename, expected_bytes): + def maybe_download(filename, expected_bytes, sha256=None): """Download a file if not present, and make sure it's the right size.""" local_filename = os.path.join(gettempdir(), filename) if not os.path.exists(local_filename): local_filename, _ = urllib.request.urlretrieve(url + filename, local_filename) statinfo = os.stat(local_filename) + + if sha256 and _hash_file(local_filename) != sha256: + raise Exception('Failed to verify ' + local_filename + ' due to hash ' + 'mismatch. Can you get to it with a browser?') + if statinfo.st_size == expected_bytes: print('Found and verified', filename) else: @@ -62,7 +77,10 @@ def word2vec_basic(log_dir): '. Can you get to it with a browser?') return local_filename - filename = maybe_download('text8.zip', 31344016) + filename = maybe_download( + 'text8.zip', + 31344016, + sha256='a6640522afe85d1963ad56c05b0ede0a0c000dddc9671758a6cc09b7a38e5232') # Read the data into a list of strings. 
def read_data(filename): diff --git a/tensorflow/python/keras/datasets/cifar10.py b/tensorflow/python/keras/datasets/cifar10.py index c23f1a263bb..f7606b657f5 100644 --- a/tensorflow/python/keras/datasets/cifar10.py +++ b/tensorflow/python/keras/datasets/cifar10.py @@ -37,7 +37,12 @@ def load_data(): """ dirname = 'cifar-10-batches-py' origin = 'https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz' - path = get_file(dirname, origin=origin, untar=True) + path = get_file( + dirname, + origin=origin, + untar=True, + file_hash= + '6d958be074577803d12ecdefd02955f39262c83c16fe9348329d7fe0b5c001ce') num_train_samples = 50000 diff --git a/tensorflow/python/keras/datasets/cifar100.py b/tensorflow/python/keras/datasets/cifar100.py index ee58d46228c..499188a5e0b 100644 --- a/tensorflow/python/keras/datasets/cifar100.py +++ b/tensorflow/python/keras/datasets/cifar100.py @@ -46,7 +46,12 @@ def load_data(label_mode='fine'): dirname = 'cifar-100-python' origin = 'https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz' - path = get_file(dirname, origin=origin, untar=True) + path = get_file( + dirname, + origin=origin, + untar=True, + file_hash= + '85cd44d02ba6437773c5bbd22e183051d648de2e7d6b014e1ef29b855ba677a7') fpath = os.path.join(path, 'train') x_train, y_train = load_batch(fpath, label_key=label_mode + '_labels') diff --git a/tensorflow/python/keras/datasets/imdb.py b/tensorflow/python/keras/datasets/imdb.py index e3a03c8d55d..d9f209add01 100644 --- a/tensorflow/python/keras/datasets/imdb.py +++ b/tensorflow/python/keras/datasets/imdb.py @@ -81,7 +81,8 @@ def load_data(path='imdb.npz', path = get_file( path, origin=origin_folder + 'imdb.npz', - file_hash='599dadb1135973df5b59232a0e9a887c') + file_hash= + '69664113be75683a8fe16e3ed0ab59fda8886cb3cd7ada244f7d9544e4676b9f') with np.load(path, allow_pickle=True) as f: x_train, labels_train = f['x_train'], f['y_train'] x_test, labels_test = f['x_test'], f['y_test'] diff --git a/tensorflow/python/keras/datasets/mnist.py b/tensorflow/python/keras/datasets/mnist.py index bad41a51642..7e012c3c0d4 100644 --- a/tensorflow/python/keras/datasets/mnist.py +++ b/tensorflow/python/keras/datasets/mnist.py @@ -46,7 +46,8 @@ def load_data(path='mnist.npz'): path = get_file( path, origin=origin_folder + 'mnist.npz', - file_hash='8a61469f7ea1b51cbae51d4f78837e45') + file_hash= + '731c5ac602752760c8e48fbffcf8c3b850d9dc2a2aedcf2cc48468fc17b673d1') with np.load(path) as f: x_train, y_train = f['x_train'], f['y_train'] x_test, y_test = f['x_test'], f['y_test'] diff --git a/tensorflow/python/keras/datasets/reuters.py b/tensorflow/python/keras/datasets/reuters.py index 560b697dff2..e1aa1f5d185 100644 --- a/tensorflow/python/keras/datasets/reuters.py +++ b/tensorflow/python/keras/datasets/reuters.py @@ -79,7 +79,8 @@ def load_data(path='reuters.npz', path = get_file( path, origin=origin_folder + 'reuters.npz', - file_hash='87aedbeb0cb229e378797a632c1997b6') + file_hash= + 'd6586e694ee56d7a4e65172e12b3e987c03096cb01eab99753921ef915959916') with np.load(path, allow_pickle=True) as f: xs, labels = f['x'], f['y'] From 8b0c84d30d957596cbb3bcac9245e114c3f0b65b Mon Sep 17 00:00:00 2001 From: Shanqing Cai Date: Tue, 23 Jul 2019 09:55:54 -0700 Subject: [PATCH 0387/3053] [tfdbg] Improve how examples binaries handle config file paths PiperOrigin-RevId: 259556240 --- tensorflow/python/debug/BUILD | 1 - tensorflow/python/debug/examples/debug_errors.py | 5 ++++- tensorflow/python/debug/examples/debug_keras.py | 16 +++++++++++++++- tensorflow/python/debug/examples/debug_mnist.py | 5 
++++- .../python/debug/examples/debug_tflearn_iris.py | 13 ++++++++++++- .../python/debug/examples/examples_test.sh | 6 +++--- tensorflow/python/debug/wrappers/hooks.py | 12 ++++++++++-- .../python/debug/wrappers/local_cli_wrapper.py | 11 +++++------ 8 files changed, 53 insertions(+), 16 deletions(-) diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD index 8d9e6b0e67c..86b94784f53 100644 --- a/tensorflow/python/debug/BUILD +++ b/tensorflow/python/debug/BUILD @@ -1171,7 +1171,6 @@ sh_test( ":offline_analyzer", ], tags = [ - "no_oss", # TODO(b/137652456): remove when fixed "no_windows", ], ) diff --git a/tensorflow/python/debug/examples/debug_errors.py b/tensorflow/python/debug/examples/debug_errors.py index 9f75e6a2c27..bf224d0ce53 100644 --- a/tensorflow/python/debug/examples/debug_errors.py +++ b/tensorflow/python/debug/examples/debug_errors.py @@ -19,6 +19,7 @@ from __future__ import print_function import argparse import sys +import tempfile import numpy as np import tensorflow as tf @@ -41,10 +42,12 @@ def main(_): z = tf.matmul(m, v, name="z") if FLAGS.debug: + config_file_path = (tempfile.mktemp(".tfdbg_config") + if FLAGS.use_random_config_path else None) sess = tf_debug.LocalCLIDebugWrapperSession( sess, ui_type=FLAGS.ui_type, - use_random_config_path=FLAGS.use_random_config_path) + config_file_path=config_file_path) if FLAGS.error == "shape_mismatch": print(sess.run(y, feed_dict={ph_float: np.array([[0.0], [1.0], [2.0]])})) diff --git a/tensorflow/python/debug/examples/debug_keras.py b/tensorflow/python/debug/examples/debug_keras.py index 019121fa0a6..f24ef58b0b2 100644 --- a/tensorflow/python/debug/examples/debug_keras.py +++ b/tensorflow/python/debug/examples/debug_keras.py @@ -20,6 +20,7 @@ from __future__ import print_function import argparse import sys +import tempfile import numpy as np import tensorflow as tf @@ -41,7 +42,12 @@ def main(_): sess = tf.Session() if FLAGS.debug: # Use the command-line interface (CLI) of tfdbg. - sess = tf_debug.LocalCLIDebugWrapperSession(sess, ui_type=FLAGS.ui_type) + config_file_path = (tempfile.mktemp(".tfdbg_config") + if FLAGS.use_random_config_path else None) + sess = tf_debug.LocalCLIDebugWrapperSession( + sess, + ui_type=FLAGS.ui_type, + config_file_path=config_file_path) elif FLAGS.tensorboard_debug_address: # Use the TensorBoard Debugger Plugin (GUI of tfdbg). 
sess = tf_debug.TensorBoardDebugWrapperSession( @@ -73,6 +79,14 @@ if __name__ == "__main__": type=str, default="curses", help="Command-line user interface type (curses | readline).") + parser.add_argument( + "--use_random_config_path", + type="bool", + nargs="?", + const=True, + default=False, + help="""If set, set config file path to a random file in the temporary + directory.""") parser.add_argument( "--tensorboard_debug_address", type=str, diff --git a/tensorflow/python/debug/examples/debug_mnist.py b/tensorflow/python/debug/examples/debug_mnist.py index 58979619032..8a31e3eae7a 100644 --- a/tensorflow/python/debug/examples/debug_mnist.py +++ b/tensorflow/python/debug/examples/debug_mnist.py @@ -26,6 +26,7 @@ from __future__ import print_function import argparse import sys +import tempfile import tensorflow as tf @@ -125,10 +126,12 @@ def main(_): "The --debug and --tensorboard_debug_address flags are mutually " "exclusive.") if FLAGS.debug: + config_file_path = (tempfile.mktemp(".tfdbg_config") + if FLAGS.use_random_config_path else None) sess = tf_debug.LocalCLIDebugWrapperSession( sess, ui_type=FLAGS.ui_type, - use_random_config_path=FLAGS.use_random_config_path) + config_file_path=config_file_path) elif FLAGS.tensorboard_debug_address: sess = tf_debug.TensorBoardDebugWrapperSession( sess, FLAGS.tensorboard_debug_address) diff --git a/tensorflow/python/debug/examples/debug_tflearn_iris.py b/tensorflow/python/debug/examples/debug_tflearn_iris.py index be9a62311b6..d05f01c9ecc 100644 --- a/tensorflow/python/debug/examples/debug_tflearn_iris.py +++ b/tensorflow/python/debug/examples/debug_tflearn_iris.py @@ -58,8 +58,11 @@ def main(_): "exclusive.") hooks = [] if FLAGS.debug: + config_file_path = (tempfile.mktemp(".tfdbg_config") + if FLAGS.use_random_config_path else None) hooks.append(tf_debug.LocalCLIDebugHook(ui_type=FLAGS.ui_type, - dump_root=FLAGS.dump_root)) + dump_root=FLAGS.dump_root, + config_file_path=config_file_path)) elif FLAGS.tensorboard_debug_address: hooks.append(tf_debug.TensorBoardDebugHook(FLAGS.tensorboard_debug_address)) @@ -122,6 +125,14 @@ if __name__ == "__main__": type=str, default="", help="Optional custom root directory for temporary debug dump data") + parser.add_argument( + "--use_random_config_path", + type="bool", + nargs="?", + const=True, + default=False, + help="""If set, set config file path to a random file in the temporary + directory.""") parser.add_argument( "--tensorboard_debug_address", type=str, diff --git a/tensorflow/python/debug/examples/examples_test.sh b/tensorflow/python/debug/examples/examples_test.sh index 727bc702af6..397d8d5c281 100755 --- a/tensorflow/python/debug/examples/examples_test.sh +++ b/tensorflow/python/debug/examples/examples_test.sh @@ -87,7 +87,7 @@ EOF CUSTOM_DUMP_ROOT=$(mktemp -d) mkdir -p ${CUSTOM_DUMP_ROOT} -cat << EOF | ${DEBUG_TFLEARN_IRIS_BIN} --debug --train_steps=2 --dump_root="${CUSTOM_DUMP_ROOT}" --ui_type=readline +cat << EOF | ${DEBUG_TFLEARN_IRIS_BIN} --debug --train_steps=2 --dump_root="${CUSTOM_DUMP_ROOT}" --ui_type=readline --use_random_config_path run -p run -f has_inf_or_nan EOF @@ -99,12 +99,12 @@ if [[ -d "${CUSTOM_DUMP_ROOT}" ]]; then fi # Test debugging of tf.keras. -cat << EOF | ${DEBUG_KERAS_BIN} --debug --ui_type=readline +cat << EOF | ${DEBUG_KERAS_BIN} --debug --ui_type=readline --use_random_config_path run -f has_inf_or_nan EOF # Test debugging of tf.keras, with non-debug runs included. 
-cat << EOF | ${DEBUG_KERAS_BIN} --debug --ui_type=readline +cat << EOF | ${DEBUG_KERAS_BIN} --debug --ui_type=readline --use_random_config_path run -t 10 EOF diff --git a/tensorflow/python/debug/wrappers/hooks.py b/tensorflow/python/debug/wrappers/hooks.py index 76d5ad28e04..4c958be257c 100644 --- a/tensorflow/python/debug/wrappers/hooks.py +++ b/tensorflow/python/debug/wrappers/hooks.py @@ -36,7 +36,11 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook): available. """ - def __init__(self, ui_type="curses", dump_root=None, thread_name_filter=None): + def __init__(self, + ui_type="curses", + dump_root=None, + thread_name_filter=None, + config_file_path=None): """Create a local debugger command-line interface (CLI) hook. Args: @@ -49,6 +53,8 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook): thread_name_filter: Regular-expression white list for threads on which the wrapper session will be active. See doc of `BaseDebugWrapperSession` for more details. + config_file_path: Optional override to the default configuration file + path, which is at `${HOME}/.tfdbg_config`. """ self._ui_type = ui_type @@ -56,6 +62,7 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook): self._thread_name_filter = thread_name_filter self._session_wrapper = None self._pending_tensor_filters = {} + self._config_file_path = config_file_path def add_tensor_filter(self, filter_name, tensor_filter): """Add a tensor filter. @@ -87,7 +94,8 @@ class LocalCLIDebugHook(session_run_hook.SessionRunHook): run_context.session, ui_type=self._ui_type, dump_root=self._dump_root, - thread_name_filter=self._thread_name_filter) + thread_name_filter=self._thread_name_filter, + config_file_path=self._config_file_path) # Actually register tensor filters registered prior to the construction # of the underlying LocalCLIDebugWrapperSession object. diff --git a/tensorflow/python/debug/wrappers/local_cli_wrapper.py b/tensorflow/python/debug/wrappers/local_cli_wrapper.py index 85a282ef33f..5f7fec5bfab 100644 --- a/tensorflow/python/debug/wrappers/local_cli_wrapper.py +++ b/tensorflow/python/debug/wrappers/local_cli_wrapper.py @@ -54,7 +54,7 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession): log_usage=True, ui_type="curses", thread_name_filter=None, - use_random_config_path=False): + config_file_path=False): """Constructor of LocalCLIDebugWrapperSession. Args: @@ -69,8 +69,8 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession): (curses | readline) thread_name_filter: Regular-expression white list for thread name. See the doc of `BaseDebugWrapperSession` for details. - use_random_config_path: If true, set config file path to a random file in - the temporary directory. + config_file_path: Optional override to the default configuration file + path, which is at `${HOME}/.tfdbg_config`. Raises: ValueError: If dump_root is an existing and non-empty directory or if @@ -127,9 +127,8 @@ class LocalCLIDebugWrapperSession(framework.BaseDebugWrapperSession): self._is_run_start = True self._ui_type = ui_type self._config = None - if use_random_config_path: - self._config = cli_config.CLIConfig( - config_file_path=os.path.join(tempfile.mkdtemp(), ".tfdbg_config")) + if config_file_path: + self._config = cli_config.CLIConfig(config_file_path=config_file_path) def _is_disk_usage_reset_each_run(self): # The dumped tensors are all cleaned up after every Session.run From 1871ce3ced4c985c1fcba027ccc2737d960661c6 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Tue, 23 Jul 2019 10:39:54 -0700 Subject: [PATCH 0388/3053] Add dependencies on @com_google_absl//absl/base:log_severity to targets including "absl/base/log_severity.h" Bump the Abseil version so this target is available. PiperOrigin-RevId: 259565345 --- tensorflow/contrib/makefile/Makefile | 2 ++ tensorflow/core/BUILD | 7 +++++++ tensorflow/workspace.bzl | 8 ++++---- 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/tensorflow/contrib/makefile/Makefile b/tensorflow/contrib/makefile/Makefile index fa8dad938d7..b6e82cb1eed 100644 --- a/tensorflow/contrib/makefile/Makefile +++ b/tensorflow/contrib/makefile/Makefile @@ -133,6 +133,8 @@ $(wildcard tensorflow/contrib/makefile/downloads/absl/absl/*/*benchmark*.cc) \ $(wildcard tensorflow/contrib/makefile/downloads/absl/absl/*/*/*benchmark*.cc) \ $(wildcard tensorflow/contrib/makefile/downloads/absl/absl/*/*/*/*benchmark*.cc) \ $(wildcard tensorflow/contrib/makefile/downloads/absl/absl/*/*/*/*/*benchmark*.cc) \ +$(wildcard tensorflow/contrib/makefile/downloads/absl/absl/random/*.cc) \ +$(wildcard tensorflow/contrib/makefile/downloads/absl/absl/random/internal/*.cc) \ tensorflow/contrib/makefile/downloads/absl/absl/synchronization/internal/mutex_nonprod.cc \ tensorflow/contrib/makefile/downloads/absl/absl/hash/internal/print_hash_of.cc diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 3b16fd92faa..89b9e2fb73f 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -363,6 +363,7 @@ cc_library( ":lib_platform", "//tensorflow/core/platform/default/build_config:base", "@com_google_absl//absl/base", + "@com_google_absl//absl/base:log_severity", "@com_google_absl//absl/strings", ], ) @@ -1190,6 +1191,7 @@ cc_library( [ "@nsync//:nsync_cpp", ] + [ + "@com_google_absl//absl/base:log_severity", "//third_party/eigen3", "//tensorflow/core/platform/default/build_config:minimal", ], @@ -2658,6 +2660,7 @@ cc_library( ":lib_internal", "//tensorflow/core/platform/default/build_config:png", "@com_google_absl//absl/base", + "@com_google_absl//absl/base:log_severity", "@com_google_absl//absl/strings", "@zlib_archive//:zlib", ], @@ -2679,6 +2682,7 @@ cc_library( deps = [ ":platform_base", "//tensorflow/core/platform/default/build_config:logging", + "@com_google_absl//absl/base:log_severity", ], ) @@ -2710,6 +2714,7 @@ cc_library( "//tensorflow/core/platform/default/build_config:jpeg", "//tensorflow/core/platform/default/build_config:logging", "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/base:log_severity", "@com_google_absl//absl/strings", ], ) @@ -2743,6 +2748,7 @@ cc_library( "//tensorflow/core/platform/default/build_config:gif", "//tensorflow/core/platform/default/build_config:logging", "@com_google_absl//absl/base:core_headers", + "@com_google_absl//absl/base:log_severity", "@com_google_absl//absl/strings", ], ) @@ -2770,6 +2776,7 @@ cc_library( linkopts = ["-ldl"], deps = [ "//tensorflow/core/platform/default/build_config:logging", + "@com_google_absl//absl/base:log_severity", "@com_google_absl//absl/strings", "@png_archive//:png", ], diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 1cfe0a2b689..8b7c32844b3 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -155,11 +155,11 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "com_google_absl", build_file = clean_dep("//third_party:com_google_absl.BUILD"), - sha256 = "acd93f6baaedc4414ebd08b33bebca7c7a46888916101d8c0b8083573526d070", - strip_prefix = 
"abseil-cpp-43ef2148c0936ebf7cb4be6b19927a9d9d145b8f", + sha256 = "eee7452846aae8040037234accf9a1cfbeca1d93bb4238b70f0d43d26645a602", + strip_prefix = "abseil-cpp-f3840bc5e33ce4932e35986cf3718450c6f02af2", urls = [ - "http://mirror.tensorflow.org/github.com/abseil/abseil-cpp/archive/43ef2148c0936ebf7cb4be6b19927a9d9d145b8f.tar.gz", - "https://github.com/abseil/abseil-cpp/archive/43ef2148c0936ebf7cb4be6b19927a9d9d145b8f.tar.gz", + "http://mirror.tensorflow.org/github.com/abseil/abseil-cpp/archive/f3840bc5e33ce4932e35986cf3718450c6f02af2.tar.gz", + "https://github.com/abseil/abseil-cpp/archive/f3840bc5e33ce4932e35986cf3718450c6f02af2.tar.gz", ], ) From 1e60d86d18ddd4464c9a2b57d72c19e9f187ba76 Mon Sep 17 00:00:00 2001 From: Raziel Alvarez Date: Tue, 23 Jul 2019 10:53:25 -0700 Subject: [PATCH 0389/3053] Updates docs with more details. PiperOrigin-RevId: 259568665 --- tensorflow/lite/schema/schema.fbs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/schema/schema.fbs b/tensorflow/lite/schema/schema.fbs index b82bbdfd103..a26f22408c9 100644 --- a/tensorflow/lite/schema/schema.fbs +++ b/tensorflow/lite/schema/schema.fbs @@ -839,9 +839,13 @@ table Operator { // The list either has the same length as `inputs`, or is empty. mutating_variable_inputs:[bool]; - // Intermediate tensors record the tensor indices that are internal to an Op. - // Those tensors contains quantization information for complicated ops such as - // LSTM. + // A list of indices to the subgraph's "tensors" that are internal to an Op. + // Internal tensors are those that do not flow in or out of the operation, + // but instead are part of internal computation. As such, the operation's + // implementation may manage its memory more efficiently. They are needed + // however (i.e. not just an implementation detail) since they are part of the + // computation, which may require relevant metadata such as quantization + // parameters. intermediates:[int]; } From 9d38112059376412d8a8996bb4337958661fd3df Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 23 Jul 2019 11:04:17 -0700 Subject: [PATCH 0390/3053] Improve error message of cloud tpu profiler. PiperOrigin-RevId: 259571236 --- .../core/profiler/rpc/client/capture_profile.cc | 11 +++++++---- .../python/tpu/profiler/capture_tpu_profile.py | 17 +++++++++++------ 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/tensorflow/core/profiler/rpc/client/capture_profile.cc b/tensorflow/core/profiler/rpc/client/capture_profile.cc index 7684c923117..842aa4a483b 100644 --- a/tensorflow/core/profiler/rpc/client/capture_profile.cc +++ b/tensorflow/core/profiler/rpc/client/capture_profile.cc @@ -80,6 +80,11 @@ ProfileRequest PopulateProfileRequest(int duration_ms, return request; } +bool ShouldRetryTracing(Status status) { + return status.code() == error::Code::UNAVAILABLE || + status.code() == error::Code::ALREADY_EXISTS; +} + // Returns whether the returned trace is empty. // Failure are handled by CHECK, i.e. 
abort() Status Profile(const string& service_addr, const string& logdir, @@ -215,16 +220,14 @@ Status StartTracing(const tensorflow::string& service_addr, status = NewSession(tpu_master, hostnames, duration_ms, repository_root, session_id, opts); } - if (remaining_attempts <= 0 || status.ok() || - status.code() != tensorflow::error::Code::UNAVAILABLE || - status.code() != tensorflow::error::Code::ALREADY_EXISTS) + if (remaining_attempts <= 0 || status.ok() || !ShouldRetryTracing(status)) break; std::cout << "No trace event is collected. Automatically retrying." << std::endl << std::endl; } - if (status.code() == tensorflow::error::Code::UNAVAILABLE) { + if (ShouldRetryTracing(status)) { std::cout << "No trace event is collected after " << num_tracing_attempts << " attempt(s). " << "Perhaps, you want to try again (with more attempts?)." diff --git a/tensorflow/python/tpu/profiler/capture_tpu_profile.py b/tensorflow/python/tpu/profiler/capture_tpu_profile.py index 6c201f78ada..53c29ab6aae 100644 --- a/tensorflow/python/tpu/profiler/capture_tpu_profile.py +++ b/tensorflow/python/tpu/profiler/capture_tpu_profile.py @@ -155,11 +155,16 @@ def main(unused_argv=None): '--tpu and using --service_addr.') service_addr = FLAGS.service_addr else: - tpu_cluster_resolver = ( - resolver.TPUClusterResolver([FLAGS.tpu], - zone=FLAGS.tpu_zone, - project=FLAGS.gcp_project)) - service_addr = tpu_cluster_resolver.get_master() + try: + tpu_cluster_resolver = ( + resolver.TPUClusterResolver([FLAGS.tpu], + zone=FLAGS.tpu_zone, + project=FLAGS.gcp_project)) + service_addr = tpu_cluster_resolver.get_master() + except (ValueError, TypeError): + sys.exit('Failed to find TPU %s in zone %s project %s. You may use ' + '--tpu_zone and --gcp_project to specify the zone and project of' + ' your TPU.' % (FLAGS.tpu, FLAGS.tpu_zone, FLAGS.gcp_project)) service_addr = service_addr.replace('grpc://', '').replace(':8470', ':8466') workers_list = '' @@ -180,7 +185,7 @@ def main(unused_argv=None): FLAGS.display_timestamp, FLAGS.num_queries) else: if not FLAGS.logdir: - sys.exit('logdir must be provided') + sys.exit('You must specify either --logdir or --monitoring_level.') try: profiler_client.start_tracing(service_addr, os.path.expanduser(FLAGS.logdir), From 5188d437349f203dc31dd5517ae81eaf8f29fde4 Mon Sep 17 00:00:00 2001 From: Jeremy Lau Date: Tue, 23 Jul 2019 11:09:59 -0700 Subject: [PATCH 0391/3053] Temporarily disable cross_device_ops_test. PiperOrigin-RevId: 259572543 --- tensorflow/python/distribute/BUILD | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index 899e5c45de7..3eebc630dbe 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -756,7 +756,8 @@ cuda_py_test( "//tensorflow/python/eager:test", ], tags = [ - "multi_and_single_gpu", + # TODO(b/138143527): Re-enable after fixing Guitar failure. + # "multi_and_single_gpu", ], xla_enable_strict_auto_jit = True, ) From 90502d11a533a477cbf80253d7481a457dff8791 Mon Sep 17 00:00:00 2001 From: Tong Shen Date: Tue, 23 Jul 2019 11:29:18 -0700 Subject: [PATCH 0392/3053] Lift outside compilation only arguments from function call nodes. 
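(Background for this patch: "lifting" an argument is the classic lambda-lifting idea. A value that the function body captures from outside the compiled cluster is turned into an explicit trailing parameter, and the call node is rebuilt to pass that value as an ordinary input. The sketch below is a deliberately tiny, non-TensorFlow illustration of that rewrite; the dict-based "function" representation and the name "oc0" are invented for the example.)

    def lift_captured(function, call_inputs, outside_values):
        """Turn captured outside values into explicit parameters and call inputs."""
        captured = [name for name in function['free'] if name in outside_values]
        lifted_function = {
            'params': function['params'] + captured,  # new trailing parameters
            'free': [n for n in function['free'] if n not in captured],
        }
        # The call site now feeds the outside values in as ordinary inputs,
        # mirroring how the pass appends inputs to the rebuilt call NodeDef.
        lifted_inputs = list(call_inputs) + [outside_values[n] for n in captured]
        return lifted_function, lifted_inputs

    # Example: f(x) whose body also reads an outside value named 'oc0'.
    f = {'params': ['x'], 'free': ['oc0']}
    g, inputs = lift_captured(f, call_inputs=[3.0], outside_values={'oc0': 7.0})
    assert g['params'] == ['x', 'oc0'] and inputs == [3.0, 7.0]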
PiperOrigin-RevId: 259576649 --- .../jit/extract_outside_compilation_pass.cc | 113 ++++++++++++++++-- 1 file changed, 100 insertions(+), 13 deletions(-) diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc index 4be94666fc4..d9c106044d5 100644 --- a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc +++ b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc @@ -781,6 +781,80 @@ Status PostprocessLiftedArgsForIf( return Status::OK(); } +Status PostprocessLiftedArgsForCall( + const std::unordered_map& outside_compilation_attr_to_node, + Graph* g, Node* n, FunctionLibraryDefinition* fld) { + const FunctionDef* fdef = fld->Find(n->type_string()); + TF_RET_CHECK(fdef); + + // Nothing to do if the function does not contain any lifted arguments. + if (!HasLiftedArgs(*fdef)) { + return Status::OK(); + } + + std::unique_ptr fbody; + TF_RETURN_IF_ERROR(FunctionDefToBodyHelper(*fdef, n->attrs(), fld, &fbody)); + + int original_arg_count = fbody->arg_nodes.size(); + + TF_ASSIGN_OR_RETURN(auto lifted_arg_nodes_and_outside_compilation_nodes, + LiftedArgsAndOutsideCompilationNodesInFunctionBody( + *fbody, outside_compilation_attr_to_node)); + + // Append lifted args' types to call node's input data types. + std::vector data_types(n->input_types().begin(), + n->input_types().end()); + for (auto pair : lifted_arg_nodes_and_outside_compilation_nodes) { + Node* outside_compilation_node = pair.second; + DataType data_type; + TF_RET_CHECK(outside_compilation_node->IsIdentity() || + outside_compilation_node->type_string() == "Placeholder"); + if (outside_compilation_node->IsIdentity()) { + TF_RETURN_IF_ERROR( + GetNodeAttr(outside_compilation_node->def(), "T", &data_type)); + } else { + TF_RETURN_IF_ERROR( + GetNodeAttr(outside_compilation_node->def(), "dtype", &data_type)); + } + data_types.push_back(data_type); + } + + for (int i = original_arg_count; i < data_types.size(); ++i) { + TF_ASSIGN_OR_RETURN( + Node * arg_node, + AddOutsideCompilationInputArgToFunctionBody(*fbody, i, data_types[i])); + + ReplaceLiftedArgNodePlaceholderWithArg( + *fbody, original_arg_count, i, + lifted_arg_nodes_and_outside_compilation_nodes, arg_node); + } + + FunctionDef rewritten_fdef; + TF_RETURN_IF_ERROR(GraphToFunctionDef(*fbody->graph, n->type_string(), + HostGraphControlRetMapping, + &rewritten_fdef)); + TF_RETURN_IF_ERROR(fld->ReplaceFunction(n->type_string(), rewritten_fdef)); + + // We need to recreate the node. Otherwise TF will not know n->num_inputs() + // has increased. + NodeDef node_def = n->def(); + for (int i = original_arg_count; i < data_types.size(); i++) { + Node* outside_compilation_node = + lifted_arg_nodes_and_outside_compilation_nodes[i - original_arg_count] + .second; + node_def.add_input(absl::StrCat(outside_compilation_node->name(), ":", 0)); + } + TF_ASSIGN_OR_RETURN(n, ReplaceNode(g, n, node_def)); + + // Add edges from outside compilation nodes to call node. + AddEdgesFromOutsideCompilationNodes( + original_arg_count, + /*arg_to_input_edge_offset=*/0, data_types, + lifted_arg_nodes_and_outside_compilation_nodes, g, n); + + return Status::OK(); +} + // Creates a mapping from outside compilation cluster name to lifted argument // placeholder. 
xla::StatusOr> OutsideCompilationAttrToNode( @@ -806,6 +880,7 @@ Status PostprocessLiftedArgs(Graph* g, FunctionLibraryDefinition* fld) { TF_ASSIGN_OR_RETURN(auto outside_compilation_attr_to_node, OutsideCompilationAttrToNode(*g)); + std::vector call_nodes; for (Node* n : g->op_nodes()) { if (!HasNodeAttr(n->def(), kXlaHasHostTransferAttrName)) { continue; @@ -820,6 +895,19 @@ Status PostprocessLiftedArgs(Graph* g, FunctionLibraryDefinition* fld) { TF_RETURN_IF_ERROR(PostprocessLiftedArgsForIf( outside_compilation_attr_to_node, g, n, fld)); } + + // Outside compilation host side function call will always be direct + // function call nodes. + // Function call nodes need to be handled separately because we rewrite + // nodes in `PostprocessLiftedArgsForCall`. + if (fld->Contains(n->type_string())) { + call_nodes.push_back(n); + } + } + + for (Node* n : call_nodes) { + TF_RETURN_IF_ERROR(PostprocessLiftedArgsForCall( + outside_compilation_attr_to_node, g, n, fld)); } return Status::OK(); @@ -1646,17 +1734,8 @@ Status ExtractOutsideCompilationForNodesWithAssociatedFunctions( if_nodes.push_back(n); } else if (n->type_string() == "While") { while_nodes.push_back(n); - } else if (fld->Contains(n->type_string())) { + } else if (IsFunctionCall(*fld, *n)) { func_call_nodes.push_back(n); - } else if (n->type_string() == FunctionLibraryDefinition::kGradientOp) { - // Only gradient for user-defined function should be considered as - // function call node. - NameAttrList original_func; - TF_RETURN_IF_ERROR(GetNodeAttr( - n->def(), FunctionLibraryDefinition::kFuncAttr, &original_func)); - if (fld->Contains(original_func.name())) { - func_call_nodes.push_back(n); - } } } @@ -1664,9 +1743,17 @@ Status ExtractOutsideCompilationForNodesWithAssociatedFunctions( // Extract outside compilation for the function call. 
bool func_has_outside_compilation = false; NameAttrList func; - func.set_name(n->type_string()); - typedef protobuf::Map AttrMap; - *func.mutable_attr() = AttrMap(n->attrs().begin(), n->attrs().end()); + if (fld->Contains(n->type_string())) { + func.set_name(n->type_string()); + typedef protobuf::Map AttrMap; + *func.mutable_attr() = AttrMap(n->attrs().begin(), n->attrs().end()); + } else if (n->IsPartitionedCall()) { + TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "f", &func)); + } else { + TF_RET_CHECK(n->type_string() == FunctionLibraryDefinition::kGradientOp); + func.set_name(FunctionLibraryDefinition::kGradientOp); + *func.mutable_attr() = n->def().attr(); + } string new_func_name = absl::StrCat(n->name(), "_oc"); string host_func_name = absl::StrCat("oc_func_call_host_", n->name()); TF_RETURN_IF_ERROR(ExtractOutsideCompilationForFunction( From bc465849179292fa0b58a43c0d64180af13caacd Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Tue, 23 Jul 2019 11:32:50 -0700 Subject: [PATCH 0393/3053] [tf.data] Handle control dependency loops in HashSubgraph PiperOrigin-RevId: 259577389 --- tensorflow/core/kernels/data/dataset_utils.cc | 232 ++++++++++++------ .../core/kernels/data/dataset_utils_test.cc | 157 ++++++++++++ 2 files changed, 319 insertions(+), 70 deletions(-) diff --git a/tensorflow/core/kernels/data/dataset_utils.cc b/tensorflow/core/kernels/data/dataset_utils.cc index 9838586111d..53128e86b3e 100644 --- a/tensorflow/core/kernels/data/dataset_utils.cc +++ b/tensorflow/core/kernels/data/dataset_utils.cc @@ -246,68 +246,14 @@ Status VerifyShapesCompatible(const std::vector& expected, namespace { -uint64 HashAttr(const FunctionDefLibrary& library, const string& attr_key, - const AttrValue& attr_value) { - uint64 attr_hash = 0; - if (attr_value.has_func()) { - for (const auto& func : library.function()) { - if (func.signature().name() == attr_value.func().name()) { - attr_hash = Hash64CombineUnordered( - attr_hash, - Hash64(absl::StrCat(attr_key, "=", - HashSubgraphFunction(library, &func)))); - break; - } - } - } else { - attr_hash = Hash64CombineUnordered( - attr_hash, Hash64(absl::StrCat(attr_key, "=", - DeterministicProtoHash64(attr_value)))); - } - - return attr_hash; +uint64 DefaultDependencyLoopNodeHash() { + static const uint64 hash = Hash64("DependencyLoopNode"); + return hash; } -uint64 HashSubgraph(const grappler::GraphView& g, const NodeDef* node) { - uint64 input_hash = 0; - uint64 control_dep_hash = 0; - - for (int i = 0; i < node->input_size(); ++i) { - DCHECK_GT(node->input(i).length(), 0); - if (node->input(i)[0] == '^') { - // TODO(frankchn): Investigate if control dependencies are necessary - // inputs to the hash. - // Control dependency node names start with '^', and order of appearance - // for the control dependencies does not matter. - control_dep_hash = Hash64CombineUnordered( - control_dep_hash, - HashSubgraph(g, g.GetNode(node->input(i).substr(1)))); - } else { - // The output port is significant and is optionally delimited by a ':' - // for non-zero ports. - std::pair node_spec = - absl::StrSplit(node->input(i), absl::MaxSplits(':', 1)); - // TODO(frankchn): Cache hashes if possible. 
- uint64 child_node_hash = HashSubgraph(g, g.GetNode(node_spec.first)); - uint64 child_port_hash = Hash64(node_spec.second); - input_hash = Hash64Combine( - input_hash, Hash64Combine(child_node_hash, child_port_hash)); - } - } - - uint64 op_hash = Hash64(node->op()); - - uint64 attr_hash = 0; - for (const auto& attr : node->attr()) { - attr_hash = Hash64CombineUnordered( - attr_hash, HashAttr(g.graph()->library(), attr.first, attr.second)); - } - - uint64 device_hash = Hash64(node->device()); - - return Hash64Combine( - Hash64Combine(attr_hash, op_hash), - Hash64Combine(device_hash, Hash64Combine(input_hash, control_dep_hash))); +uint64 DefaultDependencyLoopFnHash() { + static const uint64 hash = Hash64("DependencyLoopFn"); + return hash; } void ClearOpDefForHashing(OpDef* op) { @@ -324,18 +270,144 @@ void ClearOpDefForHashing(OpDef* op) { } } -} // namespace +// forward declaration for use in HashAttr. +uint64 HashSubgraphFunctionImpl( + const FunctionDefLibrary& library, const FunctionDef* f, + std::vector* visited, + absl::flat_hash_map* cache); + +// Produces a hash of a attribute from an op or a function. Since attributes +// may refer to functions present in the graph, we may need to hash the function +// referred to by the attribute, and thus we need the FunctionDefLibrary. +uint64 HashAttr(const FunctionDefLibrary& library, const std::string& attr_key, + const AttrValue& attr_value, std::vector* visited, + absl::flat_hash_map* cache) { + uint64 attr_hash = 0; + if (attr_value.has_func()) { + for (const auto& func : library.function()) { + if (func.signature().name() == attr_value.func().name()) { + attr_hash = Hash64CombineUnordered( + attr_hash, + Hash64(absl::StrCat( + attr_key, "=", + HashSubgraphFunctionImpl(library, &func, visited, cache)))); + break; + } + } + } else { + attr_hash = Hash64CombineUnordered( + attr_hash, Hash64(absl::StrCat(attr_key, "=", + DeterministicProtoHash64(attr_value)))); + } + + return attr_hash; +} + +// This function hashes a subgraph (rooted at node) by traversing all possible +// dependency paths from that node. +uint64 HashSubgraphImpl(const grappler::GraphView& g, const NodeDef* node, + std::vector* visited, + absl::flat_hash_map* cache) { + uint64 input_hash = 0; + uint64 control_dep_hash = 0; + + std::string canonical_node_name = absl::StrCat("node-", node->name()); + auto it = cache->find(canonical_node_name); + if (it != cache->end()) { + return it->second; + } + + uint64 op_hash = Hash64(node->op()); + + // Checks to make sure we won't get stuck in an infinite loop (especially in + // loops with control dependencies). + for (const std::string& visited_node_name : *visited) { + if (visited_node_name == canonical_node_name) { + uint64 final_hash = + Hash64Combine(DefaultDependencyLoopNodeHash(), op_hash); + (*cache)[canonical_node_name] = final_hash; + return final_hash; + } + } + visited->push_back(canonical_node_name); + + for (int i = 0; i < node->input_size(); ++i) { + DCHECK_GT(node->input(i).length(), 0); + if (node->input(i)[0] == '^') { + // TODO(frankchn): Investigate if control dependencies are necessary + // inputs to the hash. + // Control dependency node names start with '^', and order of appearance + // for the control dependencies does not matter. + control_dep_hash = Hash64CombineUnordered( + control_dep_hash, + HashSubgraphImpl(g, g.GetNode(node->input(i).substr(1)), visited, + cache)); + } else { + // The output port is significant and is optionally delimited by a ':' + // for non-zero ports. 
+ std::pair node_spec = + absl::StrSplit(node->input(i), absl::MaxSplits(':', 1)); + uint64 child_node_hash = + HashSubgraphImpl(g, g.GetNode(node_spec.first), visited, cache); + uint64 child_port_hash = Hash64(node_spec.second); + input_hash = Hash64Combine( + input_hash, Hash64Combine(child_node_hash, child_port_hash)); + } + } + + uint64 attr_hash = 0; + for (const auto& attr : node->attr()) { + attr_hash = Hash64CombineUnordered( + attr_hash, HashAttr(g.graph()->library(), attr.first, attr.second, + visited, cache)); + } + + uint64 device_hash = Hash64(node->device()); + + uint64 final_hash = Hash64Combine( + Hash64Combine(attr_hash, op_hash), + Hash64Combine(device_hash, Hash64Combine(input_hash, control_dep_hash))); + + (*cache)[canonical_node_name] = final_hash; + visited->pop_back(); + + return final_hash; +} + +// This function hashes a function by traversing all possible dependency paths +// from all output nodes declared by the function in its definition. +uint64 HashSubgraphFunctionImpl( + const FunctionDefLibrary& library, const FunctionDef* f, + std::vector* visited, + absl::flat_hash_map* cache) { + std::string canonical_function_name = + absl::StrCat("function-", f->signature().name()); + + auto it = cache->find(canonical_function_name); + if (it != cache->end()) { + return it->second; + } -uint64 HashSubgraphFunction(const FunctionDefLibrary& library, - const FunctionDef* f) { OpDef op = f->signature(); ClearOpDefForHashing(&op); uint64 signature_hash = OpDefHash(op); + // Checks to make sure we won't get stuck in an infinite loop (especially when + // functions depend on other function ops as a control dependency). + for (const std::string& visited_node_name : *visited) { + if (visited_node_name == canonical_function_name) { + uint64 final_hash = + Hash64Combine(DefaultDependencyLoopFnHash(), signature_hash); + (*cache)[canonical_function_name] = final_hash; + return final_hash; + } + } + visited->push_back(canonical_function_name); + uint64 attr_hash = 0; for (const auto& attr : f->attr()) { attr_hash = Hash64CombineUnordered( - attr_hash, HashAttr(library, attr.first, attr.second)); + attr_hash, HashAttr(library, attr.first, attr.second, visited, cache)); } uint64 arg_attr_hash = 0; @@ -343,8 +415,8 @@ uint64 HashSubgraphFunction(const FunctionDefLibrary& library, for (const auto& attr : arg_attr.second.attr()) { arg_attr_hash = Hash64CombineUnordered( arg_attr_hash, - Hash64Combine(arg_attr.first, - HashAttr(library, attr.first, attr.second))); + Hash64Combine(arg_attr.first, HashAttr(library, attr.first, + attr.second, visited, cache))); } } @@ -359,6 +431,8 @@ uint64 HashSubgraphFunction(const FunctionDefLibrary& library, node_graph_node->set_name(input_arg.name()); node_graph_node->set_op("_Retval"); } + *(node_graph.mutable_library()) = library; + grappler::GraphView node_gv(&node_graph); // TODO(frankchn): Investigate whether we need to hash the name of the @@ -371,7 +445,8 @@ uint64 HashSubgraphFunction(const FunctionDefLibrary& library, // For every return value, we need to hash the output node (and the subgraph // rooted at the output node) to ensure that the computation graph that // ends at the output node has not changed. 
- uint64 node_hash = HashSubgraph(node_gv, node_gv.GetNode(node_spec.first)); + uint64 node_hash = HashSubgraphImpl( + node_gv, node_gv.GetNode(node_spec.first), visited, cache); uint64 node_port_hash = Hash64(node_spec.second); ret_hash = Hash64CombineUnordered( @@ -383,7 +458,9 @@ uint64 HashSubgraphFunction(const FunctionDefLibrary& library, for (const auto& ret : f->control_ret()) { std::pair node_spec = absl::StrSplit(ret.second, absl::MaxSplits(':', 1)); - uint64 node_hash = HashSubgraph(node_gv, node_gv.GetNode(node_spec.first)); + + uint64 node_hash = HashSubgraphImpl( + node_gv, node_gv.GetNode(node_spec.first), visited, cache); uint64 node_port_hash = Hash64(node_spec.second); control_ret_hash = Hash64CombineUnordered( @@ -392,13 +469,28 @@ uint64 HashSubgraphFunction(const FunctionDefLibrary& library, Hash64Combine(node_hash, node_port_hash))); } - return Hash64Combine( + uint64 final_hash = Hash64Combine( Hash64Combine(Hash64Combine(signature_hash, attr_hash), arg_attr_hash), Hash64Combine(ret_hash, control_ret_hash)); + (*cache)[canonical_function_name] = final_hash; + visited->pop_back(); + + return final_hash; +} + +} // namespace + +uint64 HashSubgraphFunction(const FunctionDefLibrary& library, + const FunctionDef* f) { + std::vector visited; + absl::flat_hash_map cache; + return HashSubgraphFunctionImpl(library, f, &visited, &cache); } uint64 HashSubgraph(const GraphDef& g, const NodeDef* node) { - return HashSubgraph(grappler::GraphView(&g), node); + std::vector visited; + absl::flat_hash_map cache; + return HashSubgraphImpl(grappler::GraphView(&g), node, &visited, &cache); } namespace { diff --git a/tensorflow/core/kernels/data/dataset_utils_test.cc b/tensorflow/core/kernels/data/dataset_utils_test.cc index 98e958b5f59..f2fe5888ed0 100644 --- a/tensorflow/core/kernels/data/dataset_utils_test.cc +++ b/tensorflow/core/kernels/data/dataset_utils_test.cc @@ -546,6 +546,163 @@ TEST(DatasetUtilsTest, HashSubgraphDifferentGraphSamePartialGraph) { EXPECT_EQ(hash1, hash2); } +TEST(DatasetUtilsTest, HashSubgraphWithManyControlDependencies) { + GraphDef gd; + NodeDef* n; + + for (int i = 0; i < 1000; ++i) { + n = gd.add_node(); + NodeDefBuilder ndb(absl::StrCat("graph_1/node_", i), "Const"); + ndb.Attr("value", 1); + ndb.Device("CPU:0"); + for (int j = 0; j < i; ++j) { + ndb.ControlInput(absl::StrCat("graph_1/node_", j)); + } + TF_CHECK_OK(ndb.Finalize(n)); + } + + // No checks here, because so long as this does not time out, we are OK. + HashSubgraph(gd, n); +} + +TEST(DatasetUtilsTest, HashSubgraphFunctionsWithControlDependencyLoop) { + GraphDef gd; + + FunctionDefLibrary* fl1 = gd.mutable_library(); + FunctionDef* f1 = fl1->add_function(); + + AttrValue a1; + NameAttrList* nal1 = a1.mutable_func(); + nal1->set_name("AddAndMul"); + + std::pair func_attr = { + "body", FunctionDefHelper::AttrValueWrapper(*nal1)}; + + FunctionDef func = FunctionDefHelper::Create( + /*function_name=*/"AddAndMul", + /*in_def=*/{"i: float"}, + /*out_def=*/{"o: float"}, + /*attr_def=*/{}, + /*node_def=*/ + {{{"add"}, "Add", {"i", "i"}, {{"T", DT_FLOAT}}, {"ret"}}, + // This creates a dependency on the same function. 
+ {{"for"}, "For", {"i", "i", "i"}, {func_attr}, {"ret"}}, + {{"ret"}, "Mul", {"i", "i"}, {{"T", DT_FLOAT}}}}, + /*ret_def=*/{{"o", "for:z:0"}}, + /*control_ret_def=*/{{"must_execute", "add"}}); + *f1 = func; + + NodeDef* n1 = gd.add_node(); + TF_CHECK_OK(NodeDefBuilder("graph_1/node_1", "Const") + .Attr("value", 1) + .Device("CPU:0") + .Finalize(n1)); + + std::vector func_inputs; + func_inputs.emplace_back(n1->name(), 0, DT_FLOAT); + func_inputs.emplace_back(n1->name(), 0, DT_FLOAT); + + NodeDef* n2 = gd.add_node(); + TF_CHECK_OK(NodeDefBuilder("graph_1/node_2", "For") + .Input(n1->name(), 0, DT_INT32) + .Input(n1->name(), 0, DT_INT32) + .Input(n1->name(), 0, DT_INT32) + .Input(func_inputs) + .ControlInput("graph_1/node_2") + .Attr("body", a1) + .Device("CPU:0") + .Finalize(n2)); + + // No checks in the test, the fact that it runs and doesn't timeout or exhaust + // the stack means it is successful. + HashSubgraph(gd, n2); +} + +TEST(DatasetUtilsTest, HashSubgraphWithControlDependencyLoop) { + GraphDef gd; + + NodeDef* n1 = gd.add_node(); + TF_CHECK_OK(NodeDefBuilder("graph_1/node_1", "Const") + .Attr("value", 1) + .Device("CPU:0") + .ControlInput("graph_1/node_2") + .Finalize(n1)); + + NodeDef* n2 = gd.add_node(); + TF_CHECK_OK(NodeDefBuilder("graph_1/node_2", "Const") + .Attr("value", 2) + .Device("CPU:0") + .ControlInput("graph_1/node_1") + .Finalize(n2)); + + NodeDef* n3 = gd.add_node(); + TF_CHECK_OK(NodeDefBuilder("graph_1/node_3", "Add") + .Device("CPU:0") + .Input(n1->name(), 0, DT_INT32) + .Input(n2->name(), 0, DT_INT32) + .ControlInput("graph_1/node_1") + .ControlInput("graph_1/node_2") + .Finalize(n3)); + + // No checks in the test, the fact that it runs and doesn't timeout or exhaust + // the stack means it is successful. + HashSubgraph(gd, n3); +} + +TEST(DatasetUtilsTest, HashSubgraphWithControlDependencyLoopDifferentNames) { + GraphDef gd1; + + NodeDef* n1 = gd1.add_node(); + TF_CHECK_OK(NodeDefBuilder("graph_1/node_1", "Const") + .Attr("value", 1) + .Device("CPU:0") + .ControlInput("graph_1/node_2") + .Finalize(n1)); + + NodeDef* n2 = gd1.add_node(); + TF_CHECK_OK(NodeDefBuilder("graph_1/node_2", "Const") + .Attr("value", 2) + .Device("CPU:0") + .ControlInput("graph_1/node_1") + .Finalize(n2)); + + NodeDef* n3 = gd1.add_node(); + TF_CHECK_OK(NodeDefBuilder("graph_1/node_3", "Add") + .Device("CPU:0") + .Input(n1->name(), 0, DT_INT32) + .Input(n2->name(), 0, DT_INT32) + .ControlInput("graph_1/node_1") + .ControlInput("graph_1/node_2") + .Finalize(n3)); + + GraphDef gd2; + + NodeDef* n4 = gd2.add_node(); + TF_CHECK_OK(NodeDefBuilder("graph_1/node_4", "Const") + .Attr("value", 1) + .Device("CPU:0") + .ControlInput("graph_1/node_5") + .Finalize(n4)); + + NodeDef* n5 = gd2.add_node(); + TF_CHECK_OK(NodeDefBuilder("graph_1/node_5", "Const") + .Attr("value", 2) + .Device("CPU:0") + .ControlInput("graph_1/node_4") + .Finalize(n5)); + + NodeDef* n6 = gd2.add_node(); + TF_CHECK_OK(NodeDefBuilder("graph_1/node_6", "Add") + .Device("CPU:0") + .Input(n4->name(), 0, DT_INT32) + .Input(n5->name(), 0, DT_INT32) + .ControlInput("graph_1/node_4") + .ControlInput("graph_1/node_5") + .Finalize(n6)); + + EXPECT_EQ(HashSubgraph(gd1, n3), HashSubgraph(gd2, n6)); +} + TEST(DatasetUtilsTest, AddToFunctionLibrary) { auto make_fn_a = [](const string& fn_name) { return FunctionDefHelper::Create( From 1baa1bb06586bdfdc9f985bb757d20fce7213560 Mon Sep 17 00:00:00 2001 From: Yu-Cheng Ling Date: Tue, 23 Jul 2019 11:37:18 -0700 Subject: [PATCH 0394/3053] Fix Toco Flex tests for newly added ops 
PiperOrigin-RevId: 259578274 --- tensorflow/lite/delegates/flex/whitelisted_flex_ops.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/lite/delegates/flex/whitelisted_flex_ops.cc b/tensorflow/lite/delegates/flex/whitelisted_flex_ops.cc index 4ade603ce2f..2459bd157f6 100644 --- a/tensorflow/lite/delegates/flex/whitelisted_flex_ops.cc +++ b/tensorflow/lite/delegates/flex/whitelisted_flex_ops.cc @@ -182,7 +182,9 @@ bool IsWhitelistedFlexOp(const std::string& tensorflow_op_name) { "LRN", "MatMul", "MatrixDiag", + "MatrixDiagV2", "MatrixSetDiag", + "MatrixSetDiagV2", "Max", "Maximum", "MaxPool", From 23fca97574bda6333d87ed716f5e49e82bf47e07 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 23 Jul 2019 11:48:50 -0700 Subject: [PATCH 0395/3053] Tweak quantization-aware training re-writer to support NasFpn model architecture. PiperOrigin-RevId: 259580475 --- tensorflow/contrib/quantize/python/quantize.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/contrib/quantize/python/quantize.py b/tensorflow/contrib/quantize/python/quantize.py index f61e28bbc7e..a90647deed0 100644 --- a/tensorflow/contrib/quantize/python/quantize.py +++ b/tensorflow/contrib/quantize/python/quantize.py @@ -39,7 +39,8 @@ _RELU_TYPES = {'Relu', 'Relu6'} _QUANTIZATION_OP = {'FakeQuantWithMinMaxVars'} _VALID_SRC_OP = {'Add', 'AddV2', 'Mul'} _INTERMEDIATE_OP = {'Add', 'AddV2', 'Mul'} -_PASS_THROUGH_OP = {'Reshape', 'Identity', 'BatchToSpaceND', 'SpaceToBatchND'} +_PASS_THROUGH_OP = {'Reshape', 'Identity', 'BatchToSpaceND', 'SpaceToBatchND', + 'MaxPool', 'Max'} _VALID_ACTIVATION_OP = {'Relu', 'Relu6'} From 8fda6dd70398d77a9f8bc3a02d28bd0c0d37d52a Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 23 Jul 2019 12:17:01 -0700 Subject: [PATCH 0396/3053] Update Sequential to allow single layers to be passed in init. PiperOrigin-RevId: 259585797 --- tensorflow/python/keras/engine/sequential.py | 2 ++ tensorflow/python/keras/engine/sequential_test.py | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/tensorflow/python/keras/engine/sequential.py b/tensorflow/python/keras/engine/sequential.py index a83b6ad6d83..07638bebb7a 100644 --- a/tensorflow/python/keras/engine/sequential.py +++ b/tensorflow/python/keras/engine/sequential.py @@ -106,6 +106,8 @@ class Sequential(training.Model): # Add to the model any layers passed to the constructor. 
if layers: + if not isinstance(layers, (list, tuple)): + layers = [layers] tf_utils.assert_no_legacy_layers(layers) for layer in layers: self.add(layer) diff --git a/tensorflow/python/keras/engine/sequential_test.py b/tensorflow/python/keras/engine/sequential_test.py index babb37d6c37..e06a8953fbd 100644 --- a/tensorflow/python/keras/engine/sequential_test.py +++ b/tensorflow/python/keras/engine/sequential_test.py @@ -60,6 +60,11 @@ class TestSequential(keras_parameterized.TestCase): self.assertLen(model.weights, 2 * 2) self.assertEqual(model.get_layer(name='dp').name, 'dp') + @keras_parameterized.run_all_keras_modes + def test_single_layer_in_init(self): + model = keras.models.Sequential(keras.layers.Dense(1)) + self.assertLen(model.layers, 1) + @keras_parameterized.run_all_keras_modes def test_sequential_pop(self): num_hidden = 5 From 5f56298955baa492cb600685e4d2f0c3ab936ee7 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Tue, 23 Jul 2019 13:20:00 -0700 Subject: [PATCH 0397/3053] Handle serialized Tensors in MLIR conversion This make tensor that are using the "tensor_content" field for their serialization using DenseElementAttr instead of an Opaque Tensor in MLIR. Not only this enables constant folding, but the conversion itself is also much faster. PiperOrigin-RevId: 259597673 --- .../tests/graphdef2mlir/const-values.pbtxt | 90 +++++++++++++++++++ .../graph-11c8752c150e5643.pbtxt | 2 +- .../mlir/tensorflow/utils/convert_tensor.cc | 36 +++++++- 3 files changed, 125 insertions(+), 3 deletions(-) create mode 100644 tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/const-values.pbtxt diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/const-values.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/const-values.pbtxt new file mode 100644 index 00000000000..019deaf4df4 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/const-values.pbtxt @@ -0,0 +1,90 @@ +# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s + +node { + name: "x" + op: "Const" + device: "/device:CPU:0" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + dim { + size: 2 + } + dim { + size: 3 + } + } + tensor_content: "\x00\x00\x80\x3F\x00\x00\x00\x40\x00\x00\x40\x40\x00\x00\x80\x40\x00\x00\xA0\x40\x00\x00\xC0\x40" + # CHECK: value = dense<{{\[\[}}1.000000e+00, 2.000000e+00, 3.000000e+00], [4.000000e+00, 5.000000e+00, 6.000000e+00]]> : tensor<2x3xf32> + } + } + } +} +node { + name: "y" + op: "Const" + device: "/device:CPU:0" + attr { + key: "dtype" + value { + type: DT_INT64 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT64 + tensor_shape { + dim { + size: 2 + } + dim { + size: 3 + } + } + tensor_content: "\x01\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x07\x00\x00\x00\x00\x00\x00\x00" + # CHECK: value = dense<{{\[\[}}1, 3, 2], [5, 4, 7]]> : tensor<2x3xi64> + } + } + } +} +node { + name: "z" + op: "Const" + device: "/device:CPU:0" + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + dim { + size: 2 + } + dim { + size: 3 + } + } + tensor_content: "\x01\x00\x00\x00\x03\x00\x00\x00\x02\x00\x00\x00\x05\x00\x00\x00\x04\x00\x00\x00\x07\x00\x00\x00" + # CHECK: value = dense<{{\[\[}}1, 3, 2], [5, 4, 7]]> : tensor<2x3xi32> + } + } + } +} + diff --git 
a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-11c8752c150e5643.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-11c8752c150e5643.pbtxt index ae252ef83dd..b2dd870d66b 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-11c8752c150e5643.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-11c8752c150e5643.pbtxt @@ -92,7 +92,7 @@ versions { } # CHECK: func @main() { -# CHECK-NEXT: %0:2 = "_tf.Const"() {device = "/job:localhost/replica:0/task:0/device:TPU:0", dtype = "tfdtype$DT_INT32", name = "Empty/shape", value = opaque<"tf", "0x746674656E736F722464747970653A2044545F494E5433320A74656E736F725F7368617065207B0A202064696D207B0A2020202073697A653A20320A20207D0A7D0A74656E736F725F636F6E74656E743A20225C3230305C3030305C3030305C3030305C3230305C3030305C3030305C303030220A"> : tensor<2xi32>} : () -> (tensor<2xi32>, !_tf.control) +# CHECK-NEXT: %0:2 = "_tf.Const"() {device = "/job:localhost/replica:0/task:0/device:TPU:0", dtype = "tfdtype$DT_INT32", name = "Empty/shape", value = dense<128> : tensor<2xi32>} : () -> (tensor<2xi32>, !_tf.control) # CHECK-NEXT: %1:2 = "_tf.Empty"(%0#0) {device = "/job:localhost/replica:0/task:0/device:TPU:0", dtype = "tfdtype$DT_BFLOAT16", init = false, name = "Empty"} : (tensor<2xi32>) -> (tensor<128x128xbf16>, !_tf.control) # CHECK-NEXT: %2 = "_tf._Send"(%1#0) {T = "tfdtype$DT_BFLOAT16", client_terminated = false, device = "/job:localhost/replica:0/task:0/device:TPU:0", name = "Empty/_0", recv_device = "/job:localhost/replica:0/task:0/device:CPU:0", send_device = "/job:localhost/replica:0/task:0/device:TPU:0", send_device_incarnation = 1 : i64, tensor_name = "edge_5_Empty"} : (tensor<128x128xbf16>) -> !_tf.control # CHECK-NEXT: return diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc index f66b07b246a..e872ab3f1fb 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc @@ -127,6 +127,28 @@ ConvertToDenseElementsAttr( return mlir::DenseElementsAttr::get(type, llvm::makeArrayRef(buff)); } +// Convert a TensorFlow tensor from its raw serialization into a +// DenseElementAttr. This is a wrapper around mlir::DenseElementsAttr that +// creates a temporary copy of the data for satisfying strict aliasing +// defensively. TODO(aminim): this extra copy should not be needed, +// DenseElementAttr will perform a similar copy internally. +// Template parameter `T` must match the element type of the `type` argument +// (this is checked in DenseElementsAttr::get()). +template +mlir::DenseElementsAttr ConvertToDenseElementsAttr(const absl::Cord& values, + ShapedType type, + Builder* builder) { + DCHECK_EQ((values.size() % sizeof(T)), 0) + << "unexpected size vs elt type mismatch"; + int n_elements = values.size() / sizeof(T); + auto data = absl::make_unique(n_elements); + // This assumes that the endianess conversion was handled when loading the + // tensor in memory. + values.CopyToArray(reinterpret_cast(data.get())); + return mlir::DenseElementsAttr::get( + type, llvm::makeArrayRef(data.get(), n_elements)); +} + // Converts an TensorFlow tensor proto with DT_FLOAT data type into an MLIR // elements attribute. 
StatusOr ConvertFloatTensor(const TensorProto& input_tensor, @@ -141,6 +163,9 @@ StatusOr ConvertFloatTensor(const TensorProto& input_tensor, return ConvertToDenseElementsAttr(input_tensor.float_val(), type, builder); } + auto raw_data = input_tensor.tensor_content(); + if (raw_data.size() == type.getSizeInBits() / 8) + return ConvertToDenseElementsAttr(raw_data, type, builder); return ConvertToOpaqueElementsAttr(input_tensor, type, builder); } @@ -156,9 +181,13 @@ StatusOr ConvertIntTensor(const TensorProto& input_tensor, // set. auto repeated_val_size = input_tensor.int_val_size(); if (repeated_val_size == 1 || repeated_val_size == type.getNumElements()) { - return ConvertToDenseElementsAttr(input_tensor.int_val(), type, - builder); + return ConvertToDenseElementsAttr(input_tensor.int_val(), type, + builder); } + auto raw_data = input_tensor.tensor_content(); + if (raw_data.size() == type.getSizeInBits() / 8) + return ConvertToDenseElementsAttr(raw_data, type, builder); + return ConvertToOpaqueElementsAttr(input_tensor, type, builder); } @@ -177,6 +206,9 @@ StatusOr ConvertInt64Tensor(const TensorProto& input_tensor, uint64_t>(input_tensor.int64_val(), type, builder); } + auto raw_data = input_tensor.tensor_content(); + if (raw_data.size() == type.getSizeInBits() / 8) + return ConvertToDenseElementsAttr(raw_data, type, builder); return ConvertToOpaqueElementsAttr(input_tensor, type, builder); } From 5c7b8ea9b23a60efbbe20d3bb3b679a6be381924 Mon Sep 17 00:00:00 2001 From: Reed Wanderman-Milne Date: Tue, 23 Jul 2019 13:35:51 -0700 Subject: [PATCH 0398/3053] Remove old incompatible versions of FusedBatchNorm ops from ops_history.*.pbtxt. I removed these for the same reason as in 4c7e2edfea75200d2e4c20e32c73a8a7fb7f764b. In that commit, I only removed incompatible versions of FusedBatchNormGradV2, because I didn't realize the issue affected the other versions of FusedBatchNorm. Here, I remove it for all versions of FusedBatchNorm. 
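(Looking back at PATCH 0397 above: the "tensor_content" field is just the raw little-endian element buffer, which is why it can be reinterpreted directly as a typed dense value instead of being carried as an opaque blob. As a reader-side check, the int32 node "z" in the new const-values.pbtxt test decodes to exactly the dense value its CHECK line expects; NumPy is used here purely for illustration and is not what the C++ conversion uses.)

    import numpy as np

    # Raw bytes from the tensor_content of node "z" (six little-endian int32s).
    raw = (b"\x01\x00\x00\x00\x03\x00\x00\x00\x02\x00\x00\x00"
           b"\x05\x00\x00\x00\x04\x00\x00\x00\x07\x00\x00\x00")
    values = np.frombuffer(raw, dtype="<i4").reshape(2, 3)
    print(values)
    # [[1 3 2]
    #  [5 4 7]]   -> dense<[[1, 3, 2], [5, 4, 7]]> : tensor<2x3xi32>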
PiperOrigin-RevId: 259600636 --- .../core/ops/compat/ops_history.v1.pbtxt | 313 ------------------ .../core/ops/compat/ops_history.v2.pbtxt | 313 ------------------ 2 files changed, 626 deletions(-) diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt index 8d901ce7e03..bbcb06f32ee 100644 --- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt +++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt @@ -29491,79 +29491,6 @@ op { } } } -op { - name: "FusedBatchNorm" - input_arg { - name: "x" - type_attr: "T" - } - input_arg { - name: "scale" - type_attr: "T" - } - input_arg { - name: "offset" - type_attr: "T" - } - input_arg { - name: "mean" - type_attr: "T" - } - input_arg { - name: "variance" - type_attr: "T" - } - output_arg { - name: "y" - type_attr: "T" - } - output_arg { - name: "batch_mean" - type_attr: "T" - } - output_arg { - name: "batch_variance" - type_attr: "T" - } - output_arg { - name: "reserve_space_1" - type_attr: "T" - } - output_arg { - name: "reserve_space_2" - type_attr: "T" - } - attr { - name: "T" - type: "type" - allowed_values { - list { - type: DT_FLOAT - } - } - } - attr { - name: "epsilon" - type: "float" - default_value { - f: 0.0001 - } - } - attr { - name: "data_format" - type: "string" - default_value { - s: "NHWC" - } - } - attr { - name: "is_training" - type: "bool" - default_value { - b: true - } - } -} op { name: "FusedBatchNorm" input_arg { @@ -29643,79 +29570,6 @@ op { } } } -op { - name: "FusedBatchNormGrad" - input_arg { - name: "y_backprop" - type_attr: "T" - } - input_arg { - name: "x" - type_attr: "T" - } - input_arg { - name: "scale" - type_attr: "T" - } - input_arg { - name: "reserve_space_1" - type_attr: "T" - } - input_arg { - name: "reserve_space_2" - type_attr: "T" - } - output_arg { - name: "x_backprop" - type_attr: "T" - } - output_arg { - name: "scale_backprop" - type_attr: "T" - } - output_arg { - name: "offset_backprop" - type_attr: "T" - } - output_arg { - name: "reserve_space_3" - type_attr: "T" - } - output_arg { - name: "reserve_space_4" - type_attr: "T" - } - attr { - name: "T" - type: "type" - allowed_values { - list { - type: DT_FLOAT - } - } - } - attr { - name: "epsilon" - type: "float" - default_value { - f: 0.0001 - } - } - attr { - name: "data_format" - type: "string" - default_value { - s: "NHWC" - } - } - attr { - name: "is_training" - type: "bool" - default_value { - b: true - } - } -} op { name: "FusedBatchNormGrad" input_arg { @@ -29979,173 +29833,6 @@ op { } } } -op { - name: "FusedBatchNormV2" - input_arg { - name: "x" - type_attr: "T" - } - input_arg { - name: "scale" - type_attr: "U" - } - input_arg { - name: "offset" - type_attr: "U" - } - input_arg { - name: "mean" - type_attr: "U" - } - input_arg { - name: "variance" - type_attr: "U" - } - output_arg { - name: "y" - type_attr: "T" - } - output_arg { - name: "batch_mean" - type_attr: "U" - } - output_arg { - name: "batch_variance" - type_attr: "U" - } - output_arg { - name: "reserve_space_1" - type_attr: "U" - } - output_arg { - name: "reserve_space_2" - type_attr: "U" - } - attr { - name: "T" - type: "type" - allowed_values { - list { - type: DT_HALF - type: DT_FLOAT - } - } - } - attr { - name: "U" - type: "type" - allowed_values { - list { - type: DT_FLOAT - } - } - } - attr { - name: "epsilon" - type: "float" - default_value { - f: 0.0001 - } - } - attr { - name: "data_format" - type: "string" - default_value { - s: "NHWC" - } - } - attr { - name: "is_training" - type: "bool" - default_value { - b: true - } - 
} -} -op { - name: "FusedBatchNormV2" - input_arg { - name: "x" - type_attr: "T" - } - input_arg { - name: "scale" - type_attr: "U" - } - input_arg { - name: "offset" - type_attr: "U" - } - input_arg { - name: "mean" - type_attr: "U" - } - input_arg { - name: "variance" - type_attr: "U" - } - output_arg { - name: "y" - type_attr: "T" - } - output_arg { - name: "batch_mean" - type_attr: "U" - } - output_arg { - name: "batch_variance" - type_attr: "U" - } - output_arg { - name: "reserve_space_1" - type_attr: "U" - } - output_arg { - name: "reserve_space_2" - type_attr: "U" - } - attr { - name: "T" - type: "type" - allowed_values { - list { - type: DT_HALF - type: DT_BFLOAT16 - type: DT_FLOAT - } - } - } - attr { - name: "U" - type: "type" - allowed_values { - list { - type: DT_FLOAT - } - } - } - attr { - name: "epsilon" - type: "float" - default_value { - f: 0.0001 - } - } - attr { - name: "data_format" - type: "string" - default_value { - s: "NHWC" - } - } - attr { - name: "is_training" - type: "bool" - default_value { - b: true - } - } -} op { name: "FusedBatchNormV2" input_arg { diff --git a/tensorflow/core/ops/compat/ops_history.v2.pbtxt b/tensorflow/core/ops/compat/ops_history.v2.pbtxt index 8bfe6ad275f..2851585889f 100644 --- a/tensorflow/core/ops/compat/ops_history.v2.pbtxt +++ b/tensorflow/core/ops/compat/ops_history.v2.pbtxt @@ -27142,79 +27142,6 @@ op { } } } -op { - name: "FusedBatchNorm" - input_arg { - name: "x" - type_attr: "T" - } - input_arg { - name: "scale" - type_attr: "T" - } - input_arg { - name: "offset" - type_attr: "T" - } - input_arg { - name: "mean" - type_attr: "T" - } - input_arg { - name: "variance" - type_attr: "T" - } - output_arg { - name: "y" - type_attr: "T" - } - output_arg { - name: "batch_mean" - type_attr: "T" - } - output_arg { - name: "batch_variance" - type_attr: "T" - } - output_arg { - name: "reserve_space_1" - type_attr: "T" - } - output_arg { - name: "reserve_space_2" - type_attr: "T" - } - attr { - name: "T" - type: "type" - allowed_values { - list { - type: DT_FLOAT - } - } - } - attr { - name: "epsilon" - type: "float" - default_value { - f: 0.0001 - } - } - attr { - name: "data_format" - type: "string" - default_value { - s: "NHWC" - } - } - attr { - name: "is_training" - type: "bool" - default_value { - b: true - } - } -} op { name: "FusedBatchNorm" input_arg { @@ -27294,79 +27221,6 @@ op { } } } -op { - name: "FusedBatchNormGrad" - input_arg { - name: "y_backprop" - type_attr: "T" - } - input_arg { - name: "x" - type_attr: "T" - } - input_arg { - name: "scale" - type_attr: "T" - } - input_arg { - name: "reserve_space_1" - type_attr: "T" - } - input_arg { - name: "reserve_space_2" - type_attr: "T" - } - output_arg { - name: "x_backprop" - type_attr: "T" - } - output_arg { - name: "scale_backprop" - type_attr: "T" - } - output_arg { - name: "offset_backprop" - type_attr: "T" - } - output_arg { - name: "reserve_space_3" - type_attr: "T" - } - output_arg { - name: "reserve_space_4" - type_attr: "T" - } - attr { - name: "T" - type: "type" - allowed_values { - list { - type: DT_FLOAT - } - } - } - attr { - name: "epsilon" - type: "float" - default_value { - f: 0.0001 - } - } - attr { - name: "data_format" - type: "string" - default_value { - s: "NHWC" - } - } - attr { - name: "is_training" - type: "bool" - default_value { - b: true - } - } -} op { name: "FusedBatchNormGrad" input_arg { @@ -27536,173 +27390,6 @@ op { } } } -op { - name: "FusedBatchNormV2" - input_arg { - name: "x" - type_attr: "T" - } - input_arg { - name: "scale" - type_attr: "U" 
- } - input_arg { - name: "offset" - type_attr: "U" - } - input_arg { - name: "mean" - type_attr: "U" - } - input_arg { - name: "variance" - type_attr: "U" - } - output_arg { - name: "y" - type_attr: "T" - } - output_arg { - name: "batch_mean" - type_attr: "U" - } - output_arg { - name: "batch_variance" - type_attr: "U" - } - output_arg { - name: "reserve_space_1" - type_attr: "U" - } - output_arg { - name: "reserve_space_2" - type_attr: "U" - } - attr { - name: "T" - type: "type" - allowed_values { - list { - type: DT_HALF - type: DT_FLOAT - } - } - } - attr { - name: "U" - type: "type" - allowed_values { - list { - type: DT_FLOAT - } - } - } - attr { - name: "epsilon" - type: "float" - default_value { - f: 0.0001 - } - } - attr { - name: "data_format" - type: "string" - default_value { - s: "NHWC" - } - } - attr { - name: "is_training" - type: "bool" - default_value { - b: true - } - } -} -op { - name: "FusedBatchNormV2" - input_arg { - name: "x" - type_attr: "T" - } - input_arg { - name: "scale" - type_attr: "U" - } - input_arg { - name: "offset" - type_attr: "U" - } - input_arg { - name: "mean" - type_attr: "U" - } - input_arg { - name: "variance" - type_attr: "U" - } - output_arg { - name: "y" - type_attr: "T" - } - output_arg { - name: "batch_mean" - type_attr: "U" - } - output_arg { - name: "batch_variance" - type_attr: "U" - } - output_arg { - name: "reserve_space_1" - type_attr: "U" - } - output_arg { - name: "reserve_space_2" - type_attr: "U" - } - attr { - name: "T" - type: "type" - allowed_values { - list { - type: DT_HALF - type: DT_BFLOAT16 - type: DT_FLOAT - } - } - } - attr { - name: "U" - type: "type" - allowed_values { - list { - type: DT_FLOAT - } - } - } - attr { - name: "epsilon" - type: "float" - default_value { - f: 0.0001 - } - } - attr { - name: "data_format" - type: "string" - default_value { - s: "NHWC" - } - } - attr { - name: "is_training" - type: "bool" - default_value { - b: true - } - } -} op { name: "FusedBatchNormV2" input_arg { From 23d0d87c37120b40259c2e066a7960778041d53f Mon Sep 17 00:00:00 2001 From: Jeremy Lau Date: Tue, 23 Jul 2019 13:38:36 -0700 Subject: [PATCH 0399/3053] Temporarily disable lite_mlir_test. PiperOrigin-RevId: 259601307 --- tensorflow/lite/python/BUILD | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/lite/python/BUILD b/tensorflow/lite/python/BUILD index 9316da8e94c..db0edd96aa0 100644 --- a/tensorflow/lite/python/BUILD +++ b/tensorflow/lite/python/BUILD @@ -143,6 +143,8 @@ py_test( tags = [ "no_oss", "no_windows", + # TODO(b/138223396) Re-enable after fixing compatibility horizon issue. + "notap", ], deps = [ ":lite", From 80f4fa58575585713adfcad185d559539dd98f75 Mon Sep 17 00:00:00 2001 From: Allen Lavoie Date: Tue, 23 Jul 2019 13:39:03 -0700 Subject: [PATCH 0400/3053] Fix forwardprop_test flakes (again) One was dominating the critical path. I suspect this test is spending almost all of its time graph building... Parameterizing means each parameterization can run on a different shard. 
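As an illustration of the pattern this change adopts, a self-contained sketch follows (the test class, helper, and case values here are invented for the example and are not the patch's own code). Each tuple passed to parameterized.named_parameters expands into a separate test method, so a sharded test runner can schedule the expensive higher-order cases on different shards instead of running them all inside one test:

from absl.testing import parameterized
import tensorflow as tf


def nth_derivative(f, x, order):
  """Evaluates the order-th derivative of f at x using nested gradient tapes."""
  def differentiate(fn):
    def grad_fn(v):
      with tf.GradientTape() as tape:
        tape.watch(v)
        y = fn(v)
      return tape.gradient(y, v)
    return grad_fn
  for _ in range(order):
    f = differentiate(f)
  return f(x)


class PowDerivativesTest(tf.test.TestCase, parameterized.TestCase):

  @parameterized.named_parameters(
      ("Order0", 0, 1.1 ** 3.5),
      ("Order1", 1, 3.5 * 1.1 ** 2.5),
      ("Order2", 2, 3.5 * 2.5 * 1.1 ** 1.5),
  )
  def testPowDerivative(self, order, expected):
    # Expands into testPowDerivativeOrder0, testPowDerivativeOrder1, ...,
    # each of which can land on a different shard.
    self.assertAllClose(
        expected, nth_derivative(lambda v: v ** 3.5, tf.constant(1.1), order))


if __name__ == "__main__":
  tf.test.main()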
PiperOrigin-RevId: 259601429 --- tensorflow/python/eager/forwardprop_test.py | 23 ++++++++++++++------- tensorflow/python/framework/test_util.py | 6 +++--- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/tensorflow/python/eager/forwardprop_test.py b/tensorflow/python/eager/forwardprop_test.py index 0272ba15a7f..ffc688a9c83 100644 --- a/tensorflow/python/eager/forwardprop_test.py +++ b/tensorflow/python/eager/forwardprop_test.py @@ -20,6 +20,7 @@ from __future__ import print_function import functools import weakref +from absl.testing import parameterized import numpy as np from tensorflow.python import pywrap_tensorflow @@ -38,6 +39,13 @@ from tensorflow.python.platform import test from tensorflow.python.util import nest +_X11_35_DERIVATIVES = [ + 1.1 ** 3.5, + 3.5 * 1.1 ** 2.5, + 3.5 * 2.5 * 1.1 ** 1.5, + 3.5 * 2.5 * 1.5 * 1.1 ** 0.5] + + # TODO(allenl): Move this somewhere useful once forward gradients are stable. def _jvp(f, primals, tangents): """Compute the jacobian of `f` at `primals` multiplied by `tangents`.""" @@ -120,7 +128,7 @@ def _test_gradients(testcase, testcase.assertAllClose(sym_jac_back, sym_jac_fwd) -class ForwardpropTest(test.TestCase): +class ForwardpropTest(test.TestCase, parameterized.TestCase): def testForwardGradientFunction(self): add_outputs = (constant_op.constant(4.),) @@ -250,8 +258,11 @@ class ForwardpropTest(test.TestCase): _test_gradients(self, f, [constant_op.constant([1.])], order=3) + @parameterized.named_parameters( + [("Order{}".format(order), order, expected) + for order, expected in enumerate(_X11_35_DERIVATIVES)]) @test_util.assert_no_new_pyobjects_executing_eagerly - def testHigherOrderPureForward(self): + def testHigherOrderPureForward(self, order, expected): def _forwardgrad(f): def _compute_forwardgrad(primal): @@ -267,13 +278,9 @@ class ForwardpropTest(test.TestCase): f = _forward primal = constant_op.constant(1.1) - for expected in [1.1 ** 3.5, - 3.5 * 1.1 ** 2.5, - 3.5 * 2.5 * 1.1 ** 1.5, - 3.5 * 2.5 * 1.5 * 1.1 ** 0.5, - 3.5 * 2.5 * 1.5 * 0.5 * 1.1 ** -0.5]: - self.assertAllClose(expected, f(primal)) + for _ in range(order): f = _forwardgrad(f) + self.assertAllClose(expected, f(primal)) def testFunctionGradPureForward(self): diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index a1adf18bf35..4eaae126cef 100644 --- a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -547,7 +547,7 @@ def assert_no_new_pyobjects_executing_eagerly(f): a bit of Python. """ - def decorator(self, **kwargs): + def decorator(self, *args, **kwargs): """Warms up, gets an object count, runs the test, checks for new objects.""" with context.eager_mode(): gc.disable() @@ -558,7 +558,7 @@ def assert_no_new_pyobjects_executing_eagerly(f): # tests that fail with 1 warmup run, and pass with 2, on various versions # of python2.7.x. for _ in range(2): - f(self, **kwargs) + f(self, *args, **kwargs) gc.collect() previous_count = len(gc.get_objects()) if ops.has_default_graph(): @@ -567,7 +567,7 @@ def assert_no_new_pyobjects_executing_eagerly(f): for collection in ops.get_default_graph().collections } for _ in range(3): - f(self, **kwargs) + f(self, *args, **kwargs) # Note that gc.get_objects misses anything that isn't subject to garbage # collection (C types). Collections are a common source of leaks, so we # test for collection sizes explicitly. From 6f5cf40f9acea38c1cac04100dc9b0acf8855cad Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Tue, 23 Jul 2019 13:49:07 -0700 Subject: [PATCH 0401/3053] Fixes training_v2 path learning phase issue during validation, and adds tests. PiperOrigin-RevId: 259603576 --- .../python/keras/engine/training_eager.py | 17 +-- .../python/keras/engine/training_test.py | 126 +++++++++++++++++- tensorflow/python/keras/engine/training_v2.py | 8 +- .../python/keras/engine/training_v2_utils.py | 4 +- 4 files changed, 135 insertions(+), 20 deletions(-) diff --git a/tensorflow/python/keras/engine/training_eager.py b/tensorflow/python/keras/engine/training_eager.py index 2619af0adc2..2c182391273 100644 --- a/tensorflow/python/keras/engine/training_eager.py +++ b/tensorflow/python/keras/engine/training_eager.py @@ -346,14 +346,15 @@ def test_on_batch(model, training_utils.cast_if_floating_dtype(ops.convert_to_tensor(val)) if val is not None else None for val in sample_weights ] - outs, total_loss, output_losses, masks = ( - _model_loss( - model, - inputs, - targets, - sample_weights=sample_weights, - training=False, - output_loss_metrics=output_loss_metrics)) + with backend.eager_learning_phase_scope(0): + outs, total_loss, output_losses, masks = ( + _model_loss( + model, + inputs, + targets, + sample_weights=sample_weights, + training=False, + output_loss_metrics=output_loss_metrics)) if not isinstance(outs, list): outs = [outs] metrics_results = _eager_metrics_fn( diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py index 874de6baace..151a3532945 100644 --- a/tensorflow/python/keras/engine/training_test.py +++ b/tensorflow/python/keras/engine/training_test.py @@ -257,10 +257,128 @@ class TrainingTest(keras_parameterized.TestCase): return inputs + array_ops.constant([0], 'float32') model = keras.Sequential([ReturnTraining()]) - model.compile('sgd', 'mse') + model.compile( + 'sgd', + 'mse', + run_eagerly=testing_utils.should_run_eagerly(), + run_distributed=testing_utils.should_run_distributed()) hist = model.fit(x=np.array([0.]), y=np.array([0.])) self.assertAllClose(hist.history['loss'][0], 10000) + @keras_parameterized.run_all_keras_modes + def test_fit_and_validate_learning_phase(self): + + class ReturnTraining(keras.layers.Layer): + + def call(self, inputs): + return keras.backend.in_train_phase( + lambda: array_ops.ones_like(inputs), + lambda: array_ops.zeros_like(inputs)) + + model = keras.Sequential([ReturnTraining(input_shape=(2,))]) + model.compile( + 'sgd', + loss='mae', + run_eagerly=testing_utils.should_run_eagerly(), + run_distributed=testing_utils.should_run_distributed()) + + inputs = np.ones((40, 2), dtype=np.float32) + targets = np.ones((40, 1), dtype=np.float32) + + # Test correctness with `steps_per_epoch`. + train_dataset = dataset_ops.Dataset.from_tensor_slices( + (inputs, targets)).batch(10) + val_dataset = dataset_ops.Dataset.from_tensor_slices( + (inputs, targets)).batch(10) + history = model.fit( + train_dataset, epochs=2, verbose=1, validation_data=val_dataset) + + # The training loss should be 0.0 + self.assertAllClose(history.history['loss'][0], 0.0) + # The validation loss should be 1.0. 
+ self.assertAllClose(history.history['val_loss'][0], 1.0) + + @keras_parameterized.run_all_keras_modes + def test_fit_and_validate_training_arg(self): + + class ReturnTraining(keras.layers.Layer): + + def call(self, inputs, training=None): + return keras.backend.in_train_phase( + lambda: array_ops.ones_like(inputs), + lambda: array_ops.zeros_like(inputs), + training=training) + + model = keras.Sequential([ReturnTraining(input_shape=(2,))]) + model.compile( + 'sgd', + loss='mae', + run_eagerly=testing_utils.should_run_eagerly(), + run_distributed=testing_utils.should_run_distributed()) + + inputs = np.ones((40, 2), dtype=np.float32) + targets = np.ones((40, 1), dtype=np.float32) + + # Test correctness with `steps_per_epoch`. + train_dataset = dataset_ops.Dataset.from_tensor_slices( + (inputs, targets)).batch(10) + val_dataset = dataset_ops.Dataset.from_tensor_slices( + (inputs, targets)).batch(10) + history = model.fit( + train_dataset, epochs=2, verbose=1, validation_data=val_dataset) + + # The training loss should be 0.0 + self.assertAllClose(history.history['loss'][0], 0.0) + # The validation loss should be 1.0. + self.assertAllClose(history.history['val_loss'][0], 1.0) + + @keras_parameterized.run_all_keras_modes + def test_fit_and_validate_nested_training_arg(self): + + class NestedReturnTraining(keras.layers.Layer): + + def call(self, inputs, training=None): + return keras.backend.in_train_phase( + lambda: array_ops.ones_like(inputs), + lambda: array_ops.zeros_like(inputs), + training=training) + + class ReturnTraining(keras.layers.Layer): + + def __init__(self, input_shape=None, **kwargs): + super(ReturnTraining, self).__init__(input_shape=input_shape, **kwargs) + self._nested_layer = None + + def build(self, input_shape): + self._nested_layer = NestedReturnTraining() + self.built = True + + def call(self, inputs): + return self._nested_layer(inputs) + + model = keras.Sequential([ReturnTraining(input_shape=(2,))]) + model.compile( + 'sgd', + loss='mae', + run_eagerly=testing_utils.should_run_eagerly(), + run_distributed=testing_utils.should_run_distributed()) + + inputs = np.ones((40, 2), dtype=np.float32) + targets = np.ones((40, 1), dtype=np.float32) + + # Test correctness with `steps_per_epoch`. + train_dataset = dataset_ops.Dataset.from_tensor_slices( + (inputs, targets)).batch(10) + val_dataset = dataset_ops.Dataset.from_tensor_slices( + (inputs, targets)).batch(10) + history = model.fit( + train_dataset, epochs=2, verbose=1, validation_data=val_dataset) + + # The training loss should be 0.0 + self.assertAllClose(history.history['loss'][0], 0.0) + # The validation loss should be 1.0. 
+ self.assertAllClose(history.history['val_loss'][0], 1.0) + @keras_parameterized.run_with_all_model_types(exclude_models='sequential') @keras_parameterized.run_all_keras_modes def test_fit_on_arrays(self): @@ -1259,9 +1377,6 @@ class TrainingTest(keras_parameterized.TestCase): @keras_parameterized.run_all_keras_modes(always_skip_v1=True) def test_subclassed_model_with_training_arg(self): - if testing_utils.should_run_distributed(): - self.skipTest('b/137397816') - class LayerWithTrainingArg(keras.layers.Layer): def call(self, inputs, training=None): @@ -1288,7 +1403,8 @@ class TrainingTest(keras_parameterized.TestCase): run_distributed=testing_utils.should_run_distributed()) model.fit(x, x, epochs=1) - if testing_utils.should_run_eagerly(): + if (testing_utils.should_run_eagerly() or + testing_utils.should_run_distributed()): expected_training_arg = True else: expected_training_arg = keras.backend.symbolic_learning_phase() diff --git a/tensorflow/python/keras/engine/training_v2.py b/tensorflow/python/keras/engine/training_v2.py index dd07a94bae2..6e789ccd73c 100644 --- a/tensorflow/python/keras/engine/training_v2.py +++ b/tensorflow/python/keras/engine/training_v2.py @@ -182,9 +182,7 @@ class Loop(training_utils.TrainingLoop): dist_utils.validate_callbacks(input_callbacks=callbacks, optimizer=model.optimizer) # Enter tf.distribute.Strategy scope. - with dist_utils.distributed_scope( - strategy=strategy, learning_phase=1): - + with strategy.scope(): training_data_adapter, validation_adapter = _process_training_inputs( model, x, @@ -336,9 +334,7 @@ class Loop(training_utils.TrainingLoop): dist_utils.validate_callbacks(input_callbacks=callbacks, optimizer=model.optimizer) # Enter tf.distribute.Strategy scope. - with dist_utils.distributed_scope( - strategy=strategy, learning_phase=0): - + with strategy.scope(): adapter = _process_inputs( model, x, diff --git a/tensorflow/python/keras/engine/training_v2_utils.py b/tensorflow/python/keras/engine/training_v2_utils.py index e609559e5e8..ec898493a25 100644 --- a/tensorflow/python/keras/engine/training_v2_utils.py +++ b/tensorflow/python/keras/engine/training_v2_utils.py @@ -29,6 +29,7 @@ import functools from tensorflow.python.distribute import distribution_strategy_context from tensorflow.python.eager import def_function from tensorflow.python.framework import tensor_util +from tensorflow.python.keras import backend from tensorflow.python.keras.distribute import distributed_training_utils as dist_utils from tensorflow.python.keras.engine import training_eager from tensorflow.python.keras.engine import training_utils @@ -332,4 +333,5 @@ def predict_on_batch(model, x): if len(inputs) == 1: inputs = inputs[0] - return model(inputs) # pylint: disable=not-callable + with backend.eager_learning_phase_scope(0): + return model(inputs) # pylint: disable=not-callable From b2c81572dbf4759fe875bb316669b0f9b031158c Mon Sep 17 00:00:00 2001 From: Guozhong Zhuang Date: Tue, 23 Jul 2019 14:14:12 -0700 Subject: [PATCH 0402/3053] clean MKL ML to address mergeconflict --- tensorflow/core/util/mkl_util.h | 689 ++------------------------------ 1 file changed, 24 insertions(+), 665 deletions(-) diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h index 6deb785238c..166da34da02 100644 --- a/tensorflow/core/util/mkl_util.h +++ b/tensorflow/core/util/mkl_util.h @@ -24,32 +24,13 @@ limitations under the License. 
#include #include -#if defined(INTEL_MKL_ML_ONLY) || defined(INTEL_MKL_DNN_ONLY) -#ifndef INTEL_MKL -#error "INTEL_MKL_{ML,DNN}_ONLY require INTEL_MKL" -#endif -#endif - -#if defined(INTEL_MKL_ML_ONLY) && defined(INTEL_MKL_DNN_ONLY) -#error "at most one of INTEL_MKL_ML_ONLY and INTEL_MKL_DNN_ONLY may be defined" -#endif - -#ifdef INTEL_MKL_ML_ONLY -#error "Please use INTEL MKL DNN (the default option for --config=mkl)." -#endif - -#ifdef INTEL_MKL_ML_ONLY -#include "mkl_dnn.h" -#include "mkl_dnn_types.h" -#include "mkl_service.h" -#include "mkl_trans.h" -#endif - +#include "mkldnn.hpp" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/graph/mkl_graph_util.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/cpu_info.h" #include "tensorflow/core/platform/logging.h" @@ -58,16 +39,11 @@ limitations under the License. #include "tensorflow/core/util/padding.h" #include "tensorflow/core/util/tensor_format.h" -#ifndef INTEL_MKL_ML_ONLY -#include "mkldnn.hpp" -#include "tensorflow/core/lib/core/stringpiece.h" - using mkldnn::engine; using mkldnn::memory; using mkldnn::padding_kind; using mkldnn::primitive; using mkldnn::reorder; -#endif #ifdef _WIN32 typedef unsigned int uint; @@ -83,9 +59,6 @@ namespace tensorflow { // MKL operation, and did not go through a conversion to a standard // Tensorflow tensor. -// For use with MKL ML, has been deprecated -typedef enum { W = 0, H = 1, C = 2, N = 3 } MklDims; - // The dimensions order that MKL-DNN internally uses for 2D activations // [Batch, Channel, Height, Width] and // for 2D filters [Out_Channel, In_Channel, Height, Width]. @@ -140,7 +113,7 @@ typedef enum { MKL_GROUP_FILTER_DIM_W = 4 } MklDnnFilterGroupDims; -// Enum used to templatize MklOp kernel implementations +// Enum used to templatize MklOp kernel implementation // that support both fp32 and int8 versions. 
enum class MklQuantization { QUANTIZED_VERSION, @@ -149,269 +122,6 @@ enum class MklQuantization { static const int kSmallBatchSize = 32; -#ifdef INTEL_MKL_ML_ONLY -class MklShape { - public: - MklShape() {} - TF_DISALLOW_COPY_AND_ASSIGN(MklShape); // Cannot copy - - ~MklShape() { - if (sizes_) delete[] sizes_; - if (strides_) delete[] strides_; - if (mklLayout_) CHECK_EQ(dnnLayoutDelete_F32(mklLayout_), E_SUCCESS); - if (tfLayout_) CHECK_EQ(dnnLayoutDelete_F32(tfLayout_), E_SUCCESS); - if (tf_to_mkl_dim_map_) delete[] tf_to_mkl_dim_map_; - } - - const bool IsMklTensor() const { return isMklTensor_; } - - void SetMklTensor(const bool isMklTensor) { isMklTensor_ = isMklTensor; } - - void SetDimensions(const size_t dimension) { dimension_ = dimension; } - - void SetMklLayout(dnnLayout_t mklLayout) { mklLayout_ = mklLayout; } - - void SetMklLayout(const void* primitive, size_t resourceType) { - CHECK_EQ( - dnnLayoutCreateFromPrimitive_F32(&mklLayout_, (dnnPrimitive_t)primitive, - (dnnResourceType_t)resourceType), - E_SUCCESS); - } - - void SetTfLayout(const size_t dimension, const size_t* sizes, - const size_t* strides) { - dimension_ = dimension; - if (dimension > 0) { // MKl doesn't support zero dimension tensors - sizes_ = new size_t[dimension]; - strides_ = new size_t[dimension]; - - for (int ii = 0; ii < dimension; ii++) { - sizes_[ii] = sizes[ii]; - strides_[ii] = strides[ii]; - } - CHECK_EQ(dnnLayoutCreate_F32(&tfLayout_, dimension, sizes, strides), - E_SUCCESS); - } - } - - // Default case - MKL dim ordering is opposite of TF dim ordering - // MKL -> (DIMS-1)...0 where (DIMS-1) is outermost dim and 0 is innermost dim - // TF -> 0...(DIMS-1) where 0 is outermost dim and (DIMS-1) is innermost dim - // For layers that rely on data_format semantics (conv, pooling etc.) - // or operate only on certain dimensions (relu, concat, split etc.), - // Mkl APIs might require us to reorder these dimensions. In such cases, - // kernels should explicitly set this map - void SetTfDimOrder(const size_t dimension) { - CHECK(dimension == dimension_); - if (tf_to_mkl_dim_map_ == nullptr) { - tf_to_mkl_dim_map_ = new size_t[dimension]; - } - for (size_t ii = 0; ii < dimension; ii++) { - tf_to_mkl_dim_map_[ii] = dimension - (ii + 1); - } - } - - void SetTfDimOrder(const size_t dimension, const size_t* tf_to_mkl_dim_map) { - CHECK(dimension == dimension_); - if (tf_to_mkl_dim_map_ == nullptr) { - tf_to_mkl_dim_map_ = new size_t[dimension]; - } - for (size_t ii = 0; ii < dimension; ii++) { - tf_to_mkl_dim_map_[ii] = tf_to_mkl_dim_map[ii]; - } - } - - void SetTfDimOrder(const size_t dimension, TensorFormat data_format) { - CHECK_EQ(dimension, 4); - CHECK(dimension == dimension_); - if (tf_to_mkl_dim_map_ == nullptr) { - tf_to_mkl_dim_map_ = new size_t[dimension]; - } - tf_to_mkl_dim_map_[GetTensorDimIndex<2>(data_format, 'W')] = MklDims::W; - tf_to_mkl_dim_map_[GetTensorDimIndex<2>(data_format, 'H')] = MklDims::H; - tf_to_mkl_dim_map_[GetTensorDimIndex<2>(data_format, 'C')] = MklDims::C; - tf_to_mkl_dim_map_[GetTensorDimIndex<2>(data_format, 'N')] = MklDims::N; - } - - const dnnLayout_t GetMklLayout() const { return mklLayout_; } - const dnnLayout_t GetTfLayout() const { return tfLayout_; } - const dnnLayout_t GetCurLayout() const { - return isMklTensor_ ? 
mklLayout_ : tfLayout_; - } - size_t GetDimension() const { return dimension_; } - const size_t* GetSizes() const { return sizes_; } - int64 dim_size(int index) const { return sizes_[index]; } - int64 tf_dim_size(int index) const { - return sizes_[tf_to_mkl_dim_map_[index]]; - } - const size_t* GetStrides() const { return strides_; } - const size_t* GetTfToMklDimMap() const { return tf_to_mkl_dim_map_; } - size_t tf_dim_idx(int index) const { return tf_to_mkl_dim_map_[index]; } - - // Query TF-MKL dimension ordering map and check if Tensorflow dimension 'd' - // corresponds to MKL's Channel dimension. - bool IsMklChannelDim(int d) const { return tf_dim_idx(d) == MklDims::C; } - // Query TF-MKL dimension ordering map and check if Tensorflow dimension 'd' - // corresponds to MKL's Batch dimension. - bool IsMklBatchDim(int d) const { return tf_dim_idx(d) == MklDims::N; } - // Query TF-MKL dimension ordering map and check if Tensorflow dimension 'd' - // corresponds to MKL's Width dimension. - bool IsMklWidthDim(int d) const { return tf_dim_idx(d) == MklDims::W; } - // Query TF-MKL dimension ordering map and check if Tensorflow dimension 'd' - // corresponds to MKL's Height dimension. - bool IsMklHeightDim(int d) const { return tf_dim_idx(d) == MklDims::H; } - - // Check if the TF-Mkl dimension ordering map specifies if the input - // tensor is in NCHW format. - bool IsTensorInNCHWFormat() const { - TensorFormat data_format = FORMAT_NCHW; - return (IsMklBatchDim(GetTensorDimIndex<2>(data_format, 'N')) && - IsMklChannelDim(GetTensorDimIndex<2>(data_format, 'C')) && - IsMklHeightDim(GetTensorDimIndex<2>(data_format, 'H')) && - IsMklWidthDim(GetTensorDimIndex<2>(data_format, 'W'))); - } - - // Check if the TF-Mkl dimension ordering map specifies if the input - // tensor is in NHWC format. - bool IsTensorInNHWCFormat() const { - TensorFormat data_format = FORMAT_NHWC; - return (IsMklBatchDim(GetTensorDimIndex<2>(data_format, 'N')) && - IsMklChannelDim(GetTensorDimIndex<2>(data_format, 'C')) && - IsMklHeightDim(GetTensorDimIndex<2>(data_format, 'H')) && - IsMklWidthDim(GetTensorDimIndex<2>(data_format, 'W'))); - } - - void GetConvertedFlatData(dnnLayout_t targetLayout, void* input, - void* output) const { - dnnLayout_t curLayout; - if (isMklTensor_) - curLayout = mklLayout_; - else - curLayout = tfLayout_; - dnnPrimitive_t convert; - CHECK_EQ(dnnConversionCreate_F32(&convert, curLayout, targetLayout), - E_SUCCESS); - CHECK_EQ(dnnConversionExecute_F32(convert, input, output), E_SUCCESS); - CHECK_EQ(dnnDelete_F32(convert), E_SUCCESS); - } - -// The following methods are used for serializing and de-serializing the -// contents of the mklshape object. 
-// The data is serialized in this order -// isMklTensor_ -// dimension_ -// sizes_ -// strides_ -// mklLayout_ -// tfLayout_ -// tf_to_mkl_dim_map_ - -#define SIZE_OF_MKL_DNN_BUF \ - (dnnLayoutSerializationBufferSize_F32()) // Size of buffer needed to - // serialize dnn_layout pointer - -// Size of buffer to hold the serialized object, the size is computed as -// follows sizeof(isMklTensor_) + sizeof(dimension_) + sizeof(sizes_) + -// sizeof(strides_) -// + sizeof(mklLayout_ buffer) + sizeof(tfLayout_ buffer) -// + sizeof(tf_to_mkl_dim_map_) - -#define SIZE_OF_MKL_SERIAL_DATA(dims) \ - (2 * sizeof(size_t) + 3 * dims * sizeof(size_t) + 2 * SIZE_OF_MKL_DNN_BUF) - -// First we need to define some macro for offsets into the serial buffer where -// different elements of Mklshape is written/read from - -#define IS_MKL_TENSOR_OFFSET 0 -// Location from start of buffer where isMklTensor_ is serialized -#define DIMS_OFFSET \ - (IS_MKL_TENSOR_OFFSET + sizeof(size_t)) // Location of dimension_ -// Location of sizes. Note dim is not used here, left here -// to make macros consistent. -#define SIZES_OFFSET(dims) (DIMS_OFFSET + sizeof(size_t)) -#define STRIDES_OFFSET(dims) \ - (SIZES_OFFSET(dims) + dims * sizeof(size_t)) // Location of strides -#define MKL_LAYOUT_OFFSET(dims) \ - (STRIDES_OFFSET(dims) + dims * sizeof(size_t)) // Location of mklLayout_ -#define TF_LAYOUT_OFFSET(dims) \ - (MKL_LAYOUT_OFFSET(dims) + SIZE_OF_MKL_DNN_BUF) // Location of tfLayout_ -// Location of tf_to_mkl_dim_map_ -#define TF_TO_MKL_DIM_MAP_OFFSET(dims) \ - (TF_LAYOUT_OFFSET(dims) + SIZE_OF_MKL_DNN_BUF) - - // TODO(agramesh1) make sure to create a const to share with rewrite pass - // for min size of MKL metadata tensor. - - void DeSerializeMklShape(const unsigned char* buf, size_t buf_size) { - CHECK(buf_size >= sizeof(size_t)) << "Bufsize too small in DeSerialize"; - // Make sure buffer holds at least isMklTensor_ - isMklTensor_ = - *reinterpret_cast(buf + IS_MKL_TENSOR_OFFSET) != 0; - - if (isMklTensor_) { // If it is an MKL Tensor then read the rest - dimension_ = *(reinterpret_cast(buf + DIMS_OFFSET)); - CHECK(buf_size >= SIZE_OF_MKL_SERIAL_DATA(dimension_)) - << "Bufsize too small in DeSerialize"; - sizes_ = new size_t[dimension_]; - strides_ = new size_t[dimension_]; - tf_to_mkl_dim_map_ = new size_t[dimension_]; - for (int i = 0; i < dimension_; i++) { - sizes_[i] = - reinterpret_cast(buf + SIZES_OFFSET(dimension_))[i]; - strides_[i] = reinterpret_cast( - buf + STRIDES_OFFSET(dimension_))[i]; - tf_to_mkl_dim_map_[i] = reinterpret_cast( - buf + TF_TO_MKL_DIM_MAP_OFFSET(dimension_))[i]; - } - CHECK_EQ(dnnLayoutDeserialize_F32(&mklLayout_, - buf + MKL_LAYOUT_OFFSET(dimension_)), - E_SUCCESS); - CHECK_EQ(dnnLayoutDeserialize_F32(&tfLayout_, - buf + TF_LAYOUT_OFFSET(dimension_)), - E_SUCCESS); - } - } - - void SerializeMklShape(unsigned char* buf, size_t buf_size) const { - CHECK(buf_size >= SIZE_OF_MKL_SERIAL_DATA(dimension_)) - << "Bufsize too small to Serialize"; - *reinterpret_cast(buf + IS_MKL_TENSOR_OFFSET) = - isMklTensor_ ? 
1 : 0; - if (isMklTensor_) { - *(reinterpret_cast(buf + DIMS_OFFSET)) = dimension_; - for (int i = 0; i < dimension_; i++) { - reinterpret_cast(buf + SIZES_OFFSET(dimension_))[i] = - sizes_[i]; - reinterpret_cast(buf + STRIDES_OFFSET(dimension_))[i] = - strides_[i]; - reinterpret_cast(buf + - TF_TO_MKL_DIM_MAP_OFFSET(dimension_))[i] = - tf_to_mkl_dim_map_[i]; - } - CHECK_EQ(dnnLayoutSerialize_F32(mklLayout_, - buf + MKL_LAYOUT_OFFSET(dimension_)), - E_SUCCESS); - CHECK_EQ( - dnnLayoutSerialize_F32(tfLayout_, buf + TF_LAYOUT_OFFSET(dimension_)), - E_SUCCESS); - } - } - - private: - bool isMklTensor_ = - false; // Flag to indicate if the tensor is an MKL tensor or not - dnnLayout_t mklLayout_ = nullptr; // Pointer to the MKL layout - dnnLayout_t tfLayout_ = nullptr; // Pointer to layout of corresponding - // Tensorflow tensor, used when conversion from MKL to standard tensor - size_t dimension_ = 0; - size_t* sizes_ = nullptr; // Required by MKL for conversions - size_t* strides_ = nullptr; // Required by MKL for conversions - size_t* tf_to_mkl_dim_map_ = - nullptr; // TF dimension corresponding to this MKL dimension -}; - -#else - // Forward decl TensorFormat MklDnn3DDataFormatToTFDataFormat(memory::format format); TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format); @@ -681,8 +391,7 @@ class MklDnnShape { return IsMklTensor() ? GetMklLayout() : GetTfLayout(); } - // nhasabni - I've removed SetTfDimOrder that was setting default order in - // case of MKL-ML. We don't need a case of default dimension order because + // We don't need a case of default dimension order because // when an operator that does not get data_format attribute gets all inputs // in Tensorflow format, it will produce output in Tensorflow format. inline void SetTfDimOrder(const size_t dimension, const mkldnn_dims_t map) { @@ -731,11 +440,13 @@ class MklDnnShape { inline bool IsMklChannelDim(int d) const { return TfDimIdx(d) == MklDnnDims::Dim_C; } + /// Query TF-MKL dimension ordering map and check if Tensorflow dimension 'd' /// corresponds to MKL's Batch dimension. inline bool IsMklBatchDim(int d) const { return TfDimIdx(d) == MklDnnDims::Dim_N; } + /// Query TF-MKL dimension ordering map and check if Tensorflow dimension 'd' /// corresponds to MKL's Width dimension. inline bool IsMklWidthDim(int d) const { @@ -796,52 +507,9 @@ class MklDnnShape { } }; -#endif - // List of MklShape objects. Used in Concat/Split layers. - -#ifndef INTEL_MKL_ML_ONLY typedef std::vector MklDnnShapeList; -#else -typedef std::vector MklShapeList; -#endif -#ifdef INTEL_MKL_ML_ONLY -// Check if all tensors specified by MklShapes are MKL tensors. -inline bool AreAllMklTensors(const MklShapeList& shapes) { - for (auto& s : shapes) { - if (!s.IsMklTensor()) { - return false; - } - } - return true; -} - -template -inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor, - const MklShape& mkl_shape) { - Tensor output_tensor; - TensorShape output_shape; - - for (size_t j = 0; j < mkl_shape.GetDimension(); j++) { - // Outermost to innermost dimension - output_shape.AddDim(mkl_shape.GetSizes()[mkl_shape.tf_dim_idx(j)]); - } - - // Allocate output tensor. 
- context->allocate_temp(DataTypeToEnum::v(), output_shape, &output_tensor); - - dnnLayout_t output_layout = static_cast(mkl_shape.GetTfLayout()); - void* input_buffer = const_cast(mkl_tensor.flat().data()); - void* output_buffer = const_cast(output_tensor.flat().data()); - - if (mkl_tensor.NumElements() != 0) { - mkl_shape.GetConvertedFlatData(output_layout, input_buffer, output_buffer); - } - - return output_tensor; -} -#else using mkldnn::stream; template class MklDnnData; @@ -857,8 +525,8 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor, TensorShape output_shape = mkl_shape.GetTfShape(); // Allocate output tensor. - context->allocate_temp(DataTypeToEnum::v(), output_shape, - &output_tensor); + TF_CHECK_OK(context->allocate_temp(DataTypeToEnum::v(), output_shape, + &output_tensor)); auto cpu_engine = engine(engine::cpu, 0); MklDnnData input(&cpu_engine); @@ -887,21 +555,8 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor, } return output_tensor; } -#endif // Get the MKL shape from the second string tensor -#ifdef INTEL_MKL_ML_ONLY -inline void GetMklShape(OpKernelContext* ctext, int n, MklShape* mklshape) { - mklshape->DeSerializeMklShape( - ctext->input(GetTensorMetaDataIndex(n, ctext->num_inputs())) - .flat() - .data(), - ctext->input(GetTensorMetaDataIndex(n, ctext->num_inputs())) - .flat() - .size() * - sizeof(uint8)); -} -#else inline void GetMklShape(OpKernelContext* ctext, int n, MklDnnShape* mklshape, bool eager_mode = false) { if (!eager_mode) { @@ -917,7 +572,6 @@ inline void GetMklShape(OpKernelContext* ctext, int n, MklDnnShape* mklshape, mklshape->SetMklTensor(false); } } -#endif // Gets the actual input inline const Tensor& MklGetInput(OpKernelContext* ctext, int n) { @@ -927,25 +581,9 @@ inline const Tensor& MklGetInput(OpKernelContext* ctext, int n) { inline void GetMklInputList(OpKernelContext* ctext, StringPiece name, OpInputList* input_tensors) { CHECK_NOTNULL(input_tensors); - ctext->input_list(name, input_tensors); + TF_CHECK_OK(ctext->input_list(name, input_tensors)); } -#ifdef INTEL_MKL_ML_ONLY - -inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name, - MklShapeList* mkl_shapes) { - OpInputList input_mkl_tensors; - GetMklInputList(ctext, strings::StrCat("mkl_", name), &input_mkl_tensors); - - for (int i = 0; i < input_mkl_tensors.size(); i++) { - (*mkl_shapes)[i].DeSerializeMklShape( - input_mkl_tensors[i].flat().data(), - input_mkl_tensors[i].flat().size() * sizeof(uint8)); - } -} - -#else - inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name, MklDnnShapeList* mkl_shapes) { OpInputList input_mkl_tensors; @@ -958,9 +596,6 @@ inline void GetMklShapeList(OpKernelContext* ctext, StringPiece name, } } -#endif - -#ifndef INTEL_MKL_ML_ONLY /// Get shape of input tensor pointed by 'input_idx' in TensorShape format. /// If the input tensor is in MKL layout, then obtains TensorShape from /// MklShape. 
@@ -979,25 +614,7 @@ inline TensorShape GetTfShape(OpKernelContext* context, size_t input_idx, return t.shape(); } } -#endif -#ifdef INTEL_MKL_ML_ONLY -// Allocate the second output tensor that will contain -// the MKL shape serialized -inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n, - const MklShape& mkl_shape) { - Tensor* second_tensor = nullptr; - TensorShape second_shape; - second_shape.AddDim(SIZE_OF_MKL_SERIAL_DATA(mkl_shape.GetDimension())); - OP_REQUIRES_OK(ctext, ctext->allocate_output( - GetTensorMetaDataIndex(n, ctext->num_outputs()), - second_shape, &second_tensor)); - mkl_shape.SerializeMklShape( - second_tensor->flat().data(), - second_tensor->flat().size() * sizeof(uint8)); -} - -#else // Allocate the second output tensor that will contain // the MKL shape serialized inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n, @@ -1012,30 +629,7 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n, second_tensor->flat().data(), second_tensor->flat().size() * sizeof(uint8)); } -#endif -#ifdef INTEL_MKL_ML_ONLY -// Allocate the output tensor, create a second output tensor that will contain -// the MKL shape serialized -inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n, - Tensor** output, - const TensorShape& tf_shape, - const MklShape& mkl_shape) { - Tensor* second_tensor = nullptr; - TensorShape second_shape; - second_shape.AddDim(SIZE_OF_MKL_SERIAL_DATA(mkl_shape.GetDimension())); - OP_REQUIRES_OK( - ctext, ctext->allocate_output(GetTensorDataIndex(n, ctext->num_outputs()), - tf_shape, output)); - OP_REQUIRES_OK(ctext, ctext->allocate_output( - GetTensorMetaDataIndex(n, ctext->num_outputs()), - second_shape, &second_tensor)); - mkl_shape.SerializeMklShape( - second_tensor->flat().data(), - second_tensor->flat().size() * sizeof(uint8)); -} - -#else // Allocate the output tensor, create a second output tensor that will contain // the MKL shape serialized inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n, @@ -1058,11 +652,8 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n, second_tensor->flat().size() * sizeof(uint8)); } } -#endif // Allocates a temp tensor and returns the data buffer for temporary storage. 
-// Currently -#ifndef INTEL_MKL_ML_ONLY template inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out, const memory::primitive_desc& pd, void** buf_out) { @@ -1073,21 +664,7 @@ inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out, tf_shape, tensor_out)); *buf_out = static_cast(tensor_out->flat().data()); } -#else -inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out, - dnnLayout_t lt_buff, void** buf_out) { - TensorShape tf_shape; - tf_shape.AddDim( - dnnLayoutGetMemorySize_F32(static_cast(lt_buff)) / - sizeof(float) + - 1); - OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum::v(), - tf_shape, tensor_out)); - *buf_out = static_cast(tensor_out->flat().data()); -} - -#endif template inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out, TensorShape tf_shape) { @@ -1111,59 +688,6 @@ inline void GetStridesFromSizes(TensorFormat data_format, size_t* strides, } } -#ifdef INTEL_MKL_ML_ONLY -inline void MklSizesToTFSizes(OpKernelContext* context, - TensorFormat data_format_, - const MklShape& mkl_shape, - TensorShape* tf_shape) { - size_t tf_dim = mkl_shape.GetDimension(); - const size_t* tf_sizes = mkl_shape.GetSizes(); - - OP_REQUIRES(context, tf_dim == 4, - errors::InvalidArgument("MKLSizesToTFSizes: size must be 4-dim")); - std::vector sizes; - - sizes.push_back(tf_sizes[3]); - - if (data_format_ == FORMAT_NHWC) { - sizes.push_back(tf_sizes[1]); - sizes.push_back(tf_sizes[0]); - sizes.push_back(tf_sizes[2]); - } else { - sizes.push_back(tf_sizes[2]); - sizes.push_back(tf_sizes[1]); - sizes.push_back(tf_sizes[0]); - } - - OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(sizes, tf_shape)); -} -#endif - -inline int32 GetMklTensorDimIndex(char dimension) { - switch (dimension) { - case 'N': - return MklDims::N; - case 'C': - return MklDims::C; - case 'H': - return MklDims::H; - case 'W': - return MklDims::W; - default: - LOG(FATAL) << "Invalid dimension: " << dimension; - return -1; // Avoid compiler warning about missing return value - } -} - -#ifdef INTEL_MKL_ML_ONLY -inline int64 GetMklTensorDim(const MklShape& mkl_shape, char dimension) { - int index = GetMklTensorDimIndex(dimension); - CHECK(index >= 0 && index < mkl_shape.GetDimension()) - << "Invalid index from the dimension: " << index << ", " << dimension; - return mkl_shape.dim_size(index); -} -#endif - inline void CopyMklTensorInToOut(OpKernelContext* context, int idx_in, int idx_out) { int num_inputs = context->num_inputs(); @@ -1185,25 +709,6 @@ inline void CopyMklTensorInToOut(OpKernelContext* context, int idx_in, context->set_output(idx_meta_out, meta_output); } -#ifdef INTEL_MKL_ML_ONLY -inline void CopyTfTensorInToOutWithShape(OpKernelContext* context, int idx_in, - int idx_out, - const TensorShape& shape) { - int num_inputs = context->num_inputs(); - int num_outputs = context->num_outputs(); - int idx_data_in = GetTensorDataIndex(idx_in, num_inputs); - int idx_data_out = GetTensorDataIndex(idx_out, num_outputs); - - const Tensor& data = context->input(idx_data_in); - MklShape mkl_shape_output; - mkl_shape_output.SetMklTensor(false); - AllocateOutputSetMklShape(context, idx_out, mkl_shape_output); - Tensor output(data.dtype()); - // TODO(intel_tf): alternatively, call forward_input_to_output_with_shape(...) 
- CHECK(output.CopyFrom(data, shape)); - context->set_output(idx_data_out, output); -} -#else inline void CopyTfTensorInToOutWithShape(OpKernelContext* context, int idx_in, int idx_out, const TensorShape& shape) { @@ -1221,28 +726,6 @@ inline void CopyTfTensorInToOutWithShape(OpKernelContext* context, int idx_in, CHECK(output.CopyFrom(data, shape)); context->set_output(idx_data_out, output); } -#endif - -#ifdef INTEL_MKL_ML_ONLY - -inline void ForwardTfTensorInToOut(OpKernelContext* context, int idx_in, - int idx_out) { - int num_inputs = context->num_inputs(); - int num_outputs = context->num_outputs(); - int idx_data_in = GetTensorDataIndex(idx_in, num_inputs); - int idx_data_out = GetTensorDataIndex(idx_out, num_outputs); - - MklShape mkl_shape_output; - mkl_shape_output.SetMklTensor(false); - AllocateOutputSetMklShape(context, idx_out, mkl_shape_output); - if (IsRefType(context->input_dtype(idx_data_in))) { - context->forward_ref_input_to_ref_output(idx_data_in, idx_data_out); - } else { - context->set_output(idx_data_out, context->input(idx_data_in)); - } -} - -#else inline void ForwardTfTensorInToOut(OpKernelContext* context, int idx_in, int idx_out) { @@ -1261,8 +744,6 @@ inline void ForwardTfTensorInToOut(OpKernelContext* context, int idx_in, } } -#endif - inline void ForwardMklTensorInToOut(OpKernelContext* context, int idx_in, int idx_out) { int num_inputs = context->num_inputs(); @@ -1281,7 +762,6 @@ inline void ForwardMklTensorInToOut(OpKernelContext* context, int idx_in, } } -#ifndef INTEL_MKL_ML_ONLY // Set a dummy MKLDNN shape (called when the output is in TF format) inline void SetDummyMklDnnShapeOutput(OpKernelContext* context, uint32 idx_data_out) { @@ -1306,7 +786,6 @@ inline void ForwardMklTensorInToOutWithMklShape(OpKernelContext* context, context->set_output(idx_data_out, context->input(idx_data_in)); } } -#endif // Forward the MKL shape ONLY (used in elementwise and other ops where // we call the eigen implementation and MKL shape is not used) @@ -1325,125 +804,8 @@ inline void ForwardMklMetaDataInToOut(OpKernelContext* context, } } -#ifdef INTEL_MKL_ML_ONLY -// Set a dummy MKL shape (called when the output is in TF format) -inline void SetDummyMklShapeOutput(OpKernelContext* context, - uint32 idx_data_out) { - MklShape mkl_shape_output; - mkl_shape_output.SetMklTensor(false); - AllocateOutputSetMklShape(context, idx_data_out, mkl_shape_output); -} -// We don't need these functions in MKLDNN. We have defined equality operator -// on MklDnnShape class directly. 
- -// Checks if the TF shape for both MKL tensors is the same or not -// Returns: true if both TF shapes are the same, false otherwise -inline bool MklCompareShapes(const MklShape* input_shape_0, - const MklShape* input_shape_1) { - // Check for number of dimensions - if (input_shape_0->GetDimension() != input_shape_1->GetDimension()) { - return false; - } - - // Check size of each dimension - size_t ndims = input_shape_0->GetDimension(); - for (size_t i = 0; i < ndims; i++) { - if (input_shape_0->dim_size(i) != input_shape_1->dim_size(i)) { - return false; - } - } - - return true; -} - -// Checks if the TF shape for both tensors is the same or not -// Returns: true if TF shapes for both are the same, false otherwise -inline bool MklCompareShapes(const MklShape* input_shape_0, - const TensorShape* input_shape_1) { - // Check for number of dimensions - if (input_shape_0->GetDimension() != input_shape_1->dims()) { - return false; - } - - // Check size of each dimension - size_t ndims = input_shape_0->GetDimension(); - for (size_t i = 0; i < ndims; i++) { - if (input_shape_0->tf_dim_size(i) != input_shape_1->dim_size(i)) { - return false; - } - } - - return true; -} - -// Checks if the TF shape for both tensors is the same or not -// Returns: true if TF shapes for both are the same, false otherwise -inline bool MklCompareShapes(const TensorShape* input_shape_0, - const MklShape* input_shape_1) { - return MklCompareShapes(input_shape_1, input_shape_0); -} - -// Checks if the TF shape for both tensors is the same or not -// Returns: true if TF shapes for both are the same, false otherwise -inline bool MklCompareShapes(const TensorShape* input_shape_0, - const TensorShape* input_shape_1) { - // Check for number of dimensions - if (input_shape_0->dims() != input_shape_1->dims()) { - return false; - } - - // Check size of each dimension - size_t ndims = input_shape_0->dims(); - for (size_t i = 0; i < ndims; i++) { - if (input_shape_0->dim_size(i) != input_shape_1->dim_size(i)) { - return false; - } - } - - return true; -} - -// These functions do not compile with MKL-DNN since mkl.h is missing. -// We may need to remove them later. -// TODO(intel_tf): Remove this routine when faster MKL layout conversion is -// out. 
-inline void MklNHWCToNCHW(const Tensor& input, Tensor** output) { - const float* buf_in = input.flat().data(); - float* buf_out = (*output)->flat().data(); - - int64 N = input.dim_size(0); - int64 H = input.dim_size(1); - int64 W = input.dim_size(2); - int64 C = input.dim_size(3); - int64 stride_n = H * W * C; -#pragma omp parallel for num_threads(16) - for (int64 n = 0; n < N; ++n) { - mkl_somatcopy('R', 'T', H * W, C, 1, buf_in + n * stride_n, C, - buf_out + n * stride_n, H * W); - } -} - -inline void MklNCHWToNHWC(const Tensor& input, Tensor** output) { - const float* buf_in = input.flat().data(); - float* buf_out = (*output)->flat().data(); - - int64 N = (*output)->dim_size(0); - int64 H = (*output)->dim_size(1); - int64 W = (*output)->dim_size(2); - int64 C = (*output)->dim_size(3); - int64 stride_n = H * W * C; -#pragma omp parallel for num_threads(16) - for (int64 n = 0; n < N; ++n) { - mkl_somatcopy('R', 'T', C, H * W, 1, buf_in + n * stride_n, H * W, - buf_out + n * stride_n, C); - } -} - -#endif // ------------------------------------------------------------------- -#ifndef INTEL_MKL_ML_ONLY - /// Return MKL-DNN data type (memory::data_type) for input type T /// /// @input None @@ -1457,14 +819,17 @@ template <> memory::data_type MklDnnType() { return memory::data_type::f32; } + template <> memory::data_type MklDnnType() { return memory::data_type::u8; } + template <> memory::data_type MklDnnType() { return memory::data_type::s8; } + template <> memory::data_type MklDnnType() { return memory::data_type::s32; @@ -1524,8 +889,7 @@ inline TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format) { /// /// This function will simply map input TensorShape into MKL-DNN dims /// naively. So it will preserve the order of dimensions. E.g., if -/// input tensor is in NHWC format, then dims will be in NHWC format -/// also. +/// input tensor is in NHWC format, then dims will be in NHWC format also. /// /// @input TensorShape object in shape /// @return memory::dims corresponding to TensorShape @@ -1563,7 +927,7 @@ inline memory::dims TFShapeToMklDnnDimsInNCHW(const TensorShape& shape, inline memory::dims TFShapeToMklDnnDimsInNCDHW(const TensorShape& shape, TensorFormat format) { - // Check validity of format. + // Validate format. CHECK_NE(TFDataFormatToMklDnn3DDataFormat(format), memory::format::format_undef); @@ -1581,7 +945,7 @@ inline memory::dims TFShapeToMklDnnDimsInNCDHW(const TensorShape& shape, /// self-explanatory. inline memory::dims MklDnnDimsInNCHW(const memory::dims& in_dims, TensorFormat format) { - // Check validity of format. + // Validate format. CHECK_NE(TFDataFormatToMklDnnDataFormat(format), memory::format::format_undef); @@ -1674,10 +1038,9 @@ inline memory::desc CreateBlockedMemDescHelper(const memory::dims& dim, template inline primitive FindOrCreateReorder(const memory* from, const memory* to); -/* - * Class to represent all the resources corresponding to a tensor in TensorFlow - * that are required to execute an operation (such as Convolution). - */ + +// Class to represent all the resources corresponding to a tensor in TensorFlow +// that are required to execute an operation (such as Convolution). 
template class MklDnnData { private: @@ -1721,7 +1084,6 @@ class MklDnnData { } void SetIs3DData(bool bIs3D_) { bIs3D = bIs3D_; } - bool GetIs3D() { return bIs3D; } /// Set user memory primitive using specified dimensions, memory format and @@ -1940,9 +1302,9 @@ class MklDnnData { return false; } - /// TODO: this is a faster path with reorder primitive cache compared with - /// CheckReorderToOpMem(..., std::vector* net), will remove - /// slow path in the future + /// This is a faster path with reorder primitive cache compared with + /// CheckReorderToOpMem(..., std::vector* net). + /// TODO(gzmkl): Remove the slower path. inline bool CheckReorderToOpMem(const memory::primitive_desc& op_pd) { CHECK_NOTNULL(user_memory_); if (IsReorderNeeded(op_pd)) { @@ -1983,9 +1345,9 @@ class MklDnnData { return false; } - /// TODO: this is a faster path with reorder primitive cache compared with - /// CheckReorderToOpMem(..., std::vector* net), will remove - /// slow path in the future + /// This is a faster path with reorder primitive cache compared with + /// CheckReorderToOpMem(..., std::vector* net). + /// The slower path will be removed in the future inline bool CheckReorderToOpMem(const memory::primitive_desc& op_pd, void* reorder_data_handle) { CHECK_NOTNULL(reorder_data_handle); @@ -2082,7 +1444,6 @@ class MklDnnData { }; /// Base class for operations with reuse of primitives -/// class MklPrimitive { public: virtual ~MklPrimitive() {} @@ -2408,8 +1769,6 @@ inline bool IsConv1x1StrideNot1(memory::dims filter_dims, ((strides[0] != 1) || (strides[1] != 1))); } -#endif // INTEL_MKL_DNN - } // namespace tensorflow #endif // INTEL_MKL #endif // TENSORFLOW_CORE_UTIL_MKL_UTIL_H_ From 4de4b9b511645133b7d47e8f3225914ff9d2db80 Mon Sep 17 00:00:00 2001 From: Dong Lin Date: Tue, 23 Jul 2019 14:08:11 -0700 Subject: [PATCH 0403/3053] Wait on condition variable with timeout instead of sleeping in RunHandler PiperOrigin-RevId: 259607611 --- tensorflow/core/framework/run_handler.cc | 289 ++++++++++++++++------- 1 file changed, 204 insertions(+), 85 deletions(-) diff --git a/tensorflow/core/framework/run_handler.cc b/tensorflow/core/framework/run_handler.cc index f902eb69bd1..d851d56f9f1 100644 --- a/tensorflow/core/framework/run_handler.cc +++ b/tensorflow/core/framework/run_handler.cc @@ -94,11 +94,120 @@ class RunHandlerEnvironment { } }; +typedef typename RunHandlerEnvironment::Task Task; +typedef Eigen::RunQueue Queue; + +class ThreadWorkSource { + public: + ThreadWorkSource() + : blocking_inflight_(0), non_blocking_inflight_(0), traceme_id_(0) { + queue_waiters_.next = &queue_waiters_; + queue_waiters_.prev = &queue_waiters_; + } + + Task EnqueueTask(Task t, bool is_blocking) { + { + Queue* task_queue = + is_blocking ? &blocking_work_queue_ : &non_blocking_work_queue_; + mutex_lock l(queue_mu_); + // For a given queue, only one thread can call PushFront. + t = task_queue->PushFront(std::move(t)); + // Only wake up the thread that can take tasks from both blocking and + // non-blocking queues. The rational is that we don't want to wake up more + // threads than the available physical cores for them to compete for + // resource. The non-blocking threads are used only to compensate for + // threads that may be blocked on some tasks. There is less need to + // proactively wake up those threads. + queue_waiters_.next->cv.notify_one(); + } + VLOG(3) << "Added " << (is_blocking ? 
"inter" : "intra") << " work from " + << traceme_id_.load(std::memory_order_relaxed); + return t; + } + + Task PopTask(bool is_blocking) { + Queue* task_queue = + is_blocking ? &blocking_work_queue_ : &non_blocking_work_queue_; + + return task_queue->PopBack(); + } + + void WaitIfTaskQueuesEmpty(int max_sleep_micros) { + mutex_lock l(queue_mu_); + if (!blocking_work_queue_.Empty() || !non_blocking_work_queue_.Empty()) { + return; + } + + Waiter waiter; + // Add waiter to the LIFO queue + waiter.prev = &queue_waiters_; + waiter.next = queue_waiters_.next; + waiter.next->prev = &waiter; + waiter.prev->next = &waiter; + // Wait on the condition variable + waiter.cv.wait_for(l, std::chrono::microseconds(max_sleep_micros)); + // Remove waiter from the LIFO queue + waiter.next->prev = waiter.prev; + waiter.prev->next = waiter.next; + } + + int TaskQueueSize(bool is_blocking) { + Queue* task_queue = + is_blocking ? &blocking_work_queue_ : &non_blocking_work_queue_; + return task_queue->Size(); + } + + int64 GetTracemeId() { return traceme_id_.load(std::memory_order_relaxed); } + + void SetTracemeId(int64 value) { traceme_id_ = value; } + + int64 GetInflightTaskCount(bool is_blocking) { + std::atomic* counter = + is_blocking ? &blocking_inflight_ : &non_blocking_inflight_; + return counter->load(std::memory_order_relaxed); + } + + void IncrementInflightTaskCount(bool is_blocking) { + std::atomic* counter = + is_blocking ? &blocking_inflight_ : &non_blocking_inflight_; + counter->fetch_add(1, std::memory_order_relaxed); + } + + void DecrementInflightTaskCount(bool is_blocking) { + std::atomic* counter = + is_blocking ? &blocking_inflight_ : &non_blocking_inflight_; + counter->fetch_sub(1, std::memory_order_relaxed); + } + + std::string ToString() { + return strings::StrCat("traceme_id = ", GetTracemeId(), + ", inter queue size = ", TaskQueueSize(true), + ", inter inflight = ", GetInflightTaskCount(true), + ", intra queue size = ", TaskQueueSize(false), + ", intra inflight = ", GetInflightTaskCount(false)); + } + + private: + // To reduce cache misses, we use a doubly-linked list of Waiter structs and + // queue them in LIFO order rather than the FIFO order used by a single + // condition variable. + struct Waiter { + condition_variable cv; + Waiter* next; + Waiter* prev; + }; + + std::atomic blocking_inflight_; + std::atomic non_blocking_inflight_; + Queue blocking_work_queue_; + Queue non_blocking_work_queue_; + mutex queue_mu_; + Waiter queue_waiters_ GUARDED_BY(queue_mu_); + std::atomic traceme_id_; +}; + class RunHandlerThreadPool { public: - typedef typename RunHandlerEnvironment::Task Task; - typedef Eigen::RunQueue Queue; - struct PerThread { constexpr PerThread() : pool(nullptr), thread_id(-1) {} RunHandlerThreadPool* pool; // Parent pool, or null for normal threads. 
@@ -133,36 +242,21 @@ class RunHandlerThreadPool { cancelled_ = true; for (size_t i = 0; i < thread_data_.size(); ++i) { + { + mutex_lock l(thread_data_[i].mu); + thread_data_[i].sources_not_empty.notify_all(); + } thread_data_[i].thread.reset(); } } - struct ThreadWorkSource { - ThreadWorkSource() - : blocking_inflight(0), non_blocking_inflight(0), traceme_id(0) {} - Queue blocking_work_queue; - std::atomic blocking_inflight; - mutex blocking_mu; - Queue non_blocking_work_queue; - std::atomic non_blocking_inflight; - mutex non_blocking_mu; - std::atomic traceme_id; - }; - - void AddWorkToQueue(Queue* q, mutex* mu, bool inter_work, - std::atomic* traceme_id, + void AddWorkToQueue(ThreadWorkSource* tws, bool is_blocking, std::function fn) { Task t = env_.CreateTask(std::move(fn)); - { - mutex_lock l(*mu); - // For a given queue, only one thread can call PushFront. - t = q->PushFront(std::move(t)); - VLOG(3) << "Added " << (inter_work ? "inter" : "intra") << " work from " - << traceme_id->load(std::memory_order_relaxed); - } + t = tws->EnqueueTask(std::move(t), is_blocking); if (t.f) { - VLOG(3) << "Running " << (inter_work ? "inter" : "intra") << " work from " - << traceme_id->load(std::memory_order_relaxed); + VLOG(3) << "Running " << (is_blocking ? "inter" : "intra") << " work for " + << tws->GetTracemeId(); env_.ExecuteTask(t); } } @@ -189,6 +283,7 @@ class RunHandlerThreadPool { thread_work_sources[i]); } } + thread_data_[tid].sources_not_empty.notify_all(); } PerThread* GetPerThread() { @@ -215,10 +310,14 @@ class RunHandlerThreadPool { void WorkerLoop(int thread_id, bool may_steal_blocking_work); + void MaybeWaitForWork(bool is_blocking, int thread_id, + int32 max_blocking_inflight); + private: struct ThreadData { ThreadData() : thread_work_sources(kMaxConcurrentHandlers) {} mutex mu; + condition_variable sources_not_empty; std::unique_ptr thread; Eigen::MaxSizeVector thread_work_sources GUARDED_BY(mu); }; @@ -238,12 +337,12 @@ void RunHandlerThreadPool::WorkerLoop(int thread_id, PerThread* pt = GetPerThread(); pt->pool = this; pt->thread_id = thread_id; + static constexpr int32 kMaxBlockingInflight = 10; while (!cancelled_) { Task t; - bool inter_work = true; - std::atomic* inflight_counter = nullptr; - int64 traceme_id = 0; + ThreadWorkSource* tws = nullptr; + bool task_from_blocking_queue = true; Eigen::MaxSizeVector* thread_work_sources = &thread_data_[thread_id].thread_work_sources; { @@ -252,26 +351,20 @@ void RunHandlerThreadPool::WorkerLoop(int thread_id, mutex_lock l(thread_data_[thread_id].mu); for (int i = 0; i < thread_work_sources->size(); ++i) { - ThreadWorkSource* tws = (*thread_work_sources)[i]; + tws = (*thread_work_sources)[i]; // We want a smallish numbers of inter threads since // otherwise there will be contention in PropagateOutputs. // This is best effort policy. 
- static constexpr int32 kMaxBlockingInflight = 10; if (may_steal_blocking_work && - (tws->blocking_inflight.load(std::memory_order_relaxed) < - kMaxBlockingInflight)) { - t = tws->blocking_work_queue.PopBack(); + tws->GetInflightTaskCount(true) < kMaxBlockingInflight) { + t = tws->PopTask(true); if (t.f) { - inflight_counter = &(tws->blocking_inflight); - traceme_id = tws->traceme_id.load(std::memory_order_relaxed); break; } } - t = tws->non_blocking_work_queue.PopBack(); + t = tws->PopTask(false); if (t.f) { - inflight_counter = &(tws->non_blocking_inflight); - traceme_id = tws->traceme_id.load(std::memory_order_relaxed); - inter_work = false; + task_from_blocking_queue = false; break; } } @@ -279,15 +372,16 @@ void RunHandlerThreadPool::WorkerLoop(int thread_id, if (t.f) { profiler::TraceMe activity( [=] { - return strings::StrCat(inter_work ? "inter" : "intra", " ", - "#id = ", traceme_id, " ", thread_id, "#"); + return strings::StrCat(task_from_blocking_queue ? "inter" : "intra", + " #id = ", tws->GetTracemeId(), " ", + thread_id, "#"); }, profiler::TraceMeLevel::kInfo); - VLOG(2) << "Running " << (inter_work ? "inter" : "intra") << " work from " - << traceme_id; - inflight_counter->fetch_add(1, std::memory_order_relaxed); + VLOG(2) << "Running " << (task_from_blocking_queue ? "inter" : "intra") + << " work from " << tws->GetTracemeId(); + tws->IncrementInflightTaskCount(task_from_blocking_queue); env_.ExecuteTask(t); - inflight_counter->fetch_sub(1, std::memory_order_relaxed); + tws->DecrementInflightTaskCount(task_from_blocking_queue); } else { profiler::TraceMe activity( [=] { @@ -297,22 +391,49 @@ void RunHandlerThreadPool::WorkerLoop(int thread_id, if (VLOG_IS_ON(4)) { mutex_lock l(thread_data_[thread_id].mu); for (int i = 0; i < thread_work_sources->size(); ++i) { - ThreadWorkSource* tws = (*thread_work_sources)[i]; - VLOG(4) << "source id " << i << " traceme_id = " - << tws->traceme_id.load(std::memory_order_relaxed) - << " inter queue size " << tws->blocking_work_queue.Size() - << " inter inflight " - << tws->blocking_inflight.load(std::memory_order_relaxed) - << " intra queue size " << tws->non_blocking_work_queue.Size() - << " intra inflight " - << tws->non_blocking_inflight.load(std::memory_order_relaxed); + VLOG(4) << "source id " << i << " " + << (*thread_work_sources)[i]->ToString(); } } - Env::Default()->SleepForMicroseconds(250); + + MaybeWaitForWork(may_steal_blocking_work, thread_id, + kMaxBlockingInflight); } } } +void RunHandlerThreadPool::MaybeWaitForWork(bool is_blocking, int thread_id, + int32 max_blocking_inflight) { + const int kMaxSleepMicros = 250; + + // The non-blocking thread will just sleep. + if (!is_blocking) { + Env::Default()->SleepForMicroseconds(kMaxSleepMicros); + return; + } + + ThreadWorkSource* tws = nullptr; + { + Eigen::MaxSizeVector* thread_work_sources = + &thread_data_[thread_id].thread_work_sources; + mutex_lock l(thread_data_[thread_id].mu); + while (!cancelled_ && thread_work_sources->empty()) { + // Wait until there is new request + thread_data_[thread_id].sources_not_empty.wait(l); + } + if (cancelled_) { + return; + } + tws = (*thread_work_sources)[0]; + } + + if (tws->GetInflightTaskCount(true) >= max_blocking_inflight) { + // Sleep to reduce contention in PropagateOutputs + Env::Default()->SleepForMicroseconds(kMaxSleepMicros); + } + tws->WaitIfTaskQueuesEmpty(kMaxSleepMicros); +} + } // namespace // Contains the concrete implementation of the RunHandler. 
@@ -338,7 +459,7 @@ class RunHandler::Impl { RunHandlerPool::Impl* pool_impl() { return pool_impl_; } - RunHandlerThreadPool::ThreadWorkSource* tws() { return &tws_; } + ThreadWorkSource* tws() { return &tws_; } private: class ThreadPoolInterfaceWrapper : public thread::ThreadPoolInterface { @@ -358,7 +479,7 @@ class RunHandler::Impl { uint64 start_time_us_; int64 step_id_; std::unique_ptr thread_pool_interface_; - RunHandlerThreadPool::ThreadWorkSource tws_; + ThreadWorkSource tws_; }; // Contains shared state across all run handlers present in the pool. Also @@ -419,8 +540,8 @@ class RunHandlerPool::Impl { mutex_lock l(mu_); DCHECK_GT(sorted_active_handlers_.size(), 0); - CHECK_EQ(handler->tws()->blocking_work_queue.Size(), 0); - CHECK_EQ(handler->tws()->non_blocking_work_queue.Size(), 0); + CHECK_EQ(handler->tws()->TaskQueueSize(true), 0); + CHECK_EQ(handler->tws()->TaskQueueSize(false), 0); uint64 now = tensorflow::Env::Default()->NowMicros(); double elapsed = (now - handler->start_time_us()) / 1000.0; @@ -472,8 +593,8 @@ class RunHandlerPool::Impl { void RunHandlerPool::Impl::RecomputePoolStatsLocked() { int num_active_requests = sorted_active_handlers_.size(); if (num_active_requests == 0) return; - Eigen::MaxSizeVector - thread_work_sources(num_active_requests); + Eigen::MaxSizeVector thread_work_sources( + num_active_requests); thread_work_sources.resize(num_active_requests); for (int i = 0; i < num_active_requests; ++i) { @@ -482,21 +603,24 @@ void RunHandlerPool::Impl::RecomputePoolStatsLocked() { int num_threads = run_handler_thread_pool()->NumThreads(); int num_blocking_threads = run_handler_thread_pool()->NumBlockingThreads(); + int num_non_blocking_threads = num_threads - num_blocking_threads; + std::vector request_idx_list = ChooseRequestsWithExponentialDistribution( num_active_requests, num_blocking_threads); - - for (int tid = 0; tid < num_blocking_threads; ++tid) { - VLOG(2) << "Set work for tid=" << tid - << " with start_request_idx=" << request_idx_list[tid]; - run_handler_thread_pool()->SetThreadWorkSources(tid, request_idx_list[tid], + for (int i = 0; i < num_blocking_threads; ++i) { + VLOG(2) << "Set work for tid=" << i + << " with start_request_idx=" << request_idx_list[i]; + run_handler_thread_pool()->SetThreadWorkSources(i, request_idx_list[i], thread_work_sources); } - // Non-blocking (i.e. 
intra-op) threads always steal requests in FIFO order - for (int tid = num_blocking_threads; tid < num_threads; ++tid) { - VLOG(2) << "Set work for tid=" << tid << " with start_request_idx=0"; - run_handler_thread_pool()->SetThreadWorkSources(tid, 0, - thread_work_sources); + request_idx_list = ChooseRequestsWithExponentialDistribution( + num_active_requests, num_non_blocking_threads); + for (int i = 0; i < num_non_blocking_threads; ++i) { + VLOG(2) << "Set work for tid=" << (i + num_blocking_threads) + << " with start_request_idx=" << request_idx_list[i]; + run_handler_thread_pool()->SetThreadWorkSources( + i + num_blocking_threads, request_idx_list[i], thread_work_sources); } if (iterations_++ % 50000 == 10 && VLOG_IS_ON(1)) { @@ -514,8 +638,7 @@ void RunHandlerPool::Impl::RecomputePoolStatsLocked() { times_str += strings::StrCat( (now - sorted_active_handlers_[i]->start_time_us()) / 1000.0, " ms."); ids_str += - strings::StrCat(sorted_active_handlers_[i]->tws()->traceme_id.load( - std::memory_order_relaxed)); + strings::StrCat(sorted_active_handlers_[i]->tws()->GetTracemeId()); } VLOG(1) << "Elapsed times are: " << times_str; VLOG(1) << "Step ids are: " << ids_str; @@ -545,25 +668,21 @@ RunHandler::Impl::Impl(RunHandlerPool::Impl* pool_impl) } void RunHandler::Impl::ScheduleInterOpClosure(std::function fn) { - VLOG(3) << "Scheduling inter work for " - << tws()->traceme_id.load(std::memory_order_relaxed); - pool_impl_->run_handler_thread_pool()->AddWorkToQueue( - &tws()->blocking_work_queue, &tws()->blocking_mu, true, - &tws()->traceme_id, std::move(fn)); + VLOG(3) << "Scheduling inter work for " << tws()->GetTracemeId(); + pool_impl_->run_handler_thread_pool()->AddWorkToQueue(tws(), true, + std::move(fn)); } void RunHandler::Impl::ScheduleIntraOpClosure(std::function fn) { - VLOG(3) << "Scheduling inter work for " - << tws()->traceme_id.load(std::memory_order_relaxed); - pool_impl_->run_handler_thread_pool()->AddWorkToQueue( - &tws()->non_blocking_work_queue, &tws()->non_blocking_mu, false, - &tws()->traceme_id, std::move(fn)); + VLOG(3) << "Scheduling inter work for " << tws()->GetTracemeId(); + pool_impl_->run_handler_thread_pool()->AddWorkToQueue(tws(), false, + std::move(fn)); } void RunHandler::Impl::Reset(int64 step_id) { start_time_us_ = tensorflow::Env::Default()->NowMicros(); step_id_ = step_id; - tws_.traceme_id = step_id; + tws_.SetTracemeId(step_id); } RunHandlerPool::RunHandlerPool(int num_inter_op_threads) From c33f1d1a6186ab0f4a9ca3b9af3a7affc85f251d Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Tue, 23 Jul 2019 14:08:48 -0700 Subject: [PATCH 0404/3053] Update LSTM/GRU to support masking inputs with CuDNN kernel. Since CuDNN kernel only support right padded data, the GPU specific function has been updated with a tf cond to check that. If the batch of the data meet that criteria, then it could use the CuDNN kernel, otherwise it will fallback to use the normal kernel on GPU. 
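In outline, the selection this change implements can be sketched with public TensorFlow ops as follows. This is a minimal sketch, not the patch code itself: is_right_padded, run_cudnn and run_standard below are hypothetical stand-ins for the is_sequence_right_padded, cudnn_gru/cudnn_lstm and standard_gru/standard_lstm helpers the patch adds in recurrent_v2.py further down.

import tensorflow as tf

def is_right_padded(mask):
  # mask: [batch, time] bool tensor, True for valid steps.
  # A batch is strictly right padded if, per sequence, every valid step comes
  # before the first padded step, i.e. the mask equals a mask rebuilt from the
  # per-sequence count of valid steps.
  lengths = tf.reduce_sum(tf.cast(mask, tf.int32), axis=1)
  rebuilt = tf.sequence_mask(lengths, maxlen=tf.shape(mask)[1])
  return tf.reduce_all(tf.equal(mask, rebuilt))

def select_rnn_kernel(mask, run_cudnn, run_standard):
  # The CuDNN RNN kernel only handles right-padded batches, so fall back to
  # the generic kernel whenever padding appears in the middle of a sequence.
  return tf.cond(is_right_padded(mask), run_cudnn, run_standard)

For example, zeroing only the last timestep of a batch (as the tests below do) keeps the CuDNN path on GPU, while zeroing a timestep in the middle of the sequence routes execution to the standard kernel.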
PiperOrigin-RevId: 259607726 --- tensorflow/python/keras/layers/gru_v2_test.py | 56 +++++- .../python/keras/layers/lstm_v2_test.py | 55 +++++ .../python/keras/layers/recurrent_v2.py | 189 +++++++++++++----- 3 files changed, 250 insertions(+), 50 deletions(-) diff --git a/tensorflow/python/keras/layers/gru_v2_test.py b/tensorflow/python/keras/layers/gru_v2_test.py index ca5e6f3d2e7..29c45fce2cf 100644 --- a/tensorflow/python/keras/layers/gru_v2_test.py +++ b/tensorflow/python/keras/layers/gru_v2_test.py @@ -626,9 +626,63 @@ class GRUGraphRewriteTest(keras_parameterized.TestCase): model = keras.models.Model(inputs=inputs, outputs=[outputs, runtime]) self._test_runtime_with_model(model) + def test_GRU_runtime_with_mask(self): + # Masking will affect which backend is selected based on whether the mask + # is strictly right padded. + layer = rnn.GRU(self.rnn_state_size, return_runtime=True) + + inputs = keras.layers.Input( + shape=[self.timestep, self.input_shape], dtype=dtypes.float32) + masked_inputs = keras.layers.Masking()(inputs) + + outputs, runtime = layer(masked_inputs) + # Expand the runtime so that it is a 1D tensor instead of scalar. + # TF model does not work with scalar model output, specially during + # aggregation. + runtime = keras.layers.Lambda( + lambda x: array_ops.expand_dims(x, axis=-1))(runtime) + model = keras.models.Model(inputs=inputs, outputs=[outputs, runtime]) + + (x_train, y_train), _ = testing_utils.get_test_data( + train_samples=self.batch, + test_samples=0, + input_shape=(self.timestep, self.input_shape), + num_classes=self.output_shape) + y_train = keras.utils.to_categorical(y_train, self.output_shape) + + model.compile(optimizer='sgd', + loss=['categorical_crossentropy', None], + run_eagerly=testing_utils.should_run_eagerly(), + run_distributed=testing_utils.should_run_distributed()) + + model.fit(x_train, y_train) + + # Verify unpadded data. + _, runtime_value = model.predict(x_train) + if test.is_gpu_available(): + self.assertEqual(runtime_value[0], rnn._RUNTIME_GPU) + else: + self.assertEqual(runtime_value[0], rnn._RUNTIME_CPU) + + # Update x/y to be right padded by setting the last timestep to 0 + x_train[:, -1, :] = 0 + y_train[:, -1] = 0 + _, runtime_value = model.predict(x_train) + if test.is_gpu_available(): + self.assertEqual(runtime_value[0], rnn._RUNTIME_GPU) + else: + self.assertEqual(runtime_value[0], rnn._RUNTIME_CPU) + + # Further update x/y to be mix padded (masks in the middle), and verify + # only cpu kernel can be selected. + x_train[:, -3, :] = 0 + y_train[:, -3] = 0 + _, runtime_value = model.predict(x_train) + self.assertEqual(runtime_value[0], rnn._RUNTIME_CPU) + # Due to b/120160788. @test_util.run_v2_only - def test_UnifiedGRU_with_cond(self): + def test_GRU_runtime_with_cond(self): # This test is to demonstrate the graph rewrite of grappler plugin under # the condition that the function returns different number of internal # states. 
diff --git a/tensorflow/python/keras/layers/lstm_v2_test.py b/tensorflow/python/keras/layers/lstm_v2_test.py index 4af056a1b31..5ddbf2d046c 100644 --- a/tensorflow/python/keras/layers/lstm_v2_test.py +++ b/tensorflow/python/keras/layers/lstm_v2_test.py @@ -769,6 +769,7 @@ class LSTMGraphRewriteTest(keras_parameterized.TestCase): model.compile(optimizer='sgd', loss=['categorical_crossentropy', None], + run_eagerly=testing_utils.should_run_eagerly(), run_distributed=testing_utils.should_run_distributed()) existing_loss = 0 @@ -800,6 +801,60 @@ class LSTMGraphRewriteTest(keras_parameterized.TestCase): model = keras.models.Model(inputs=inputs, outputs=[outputs, runtime]) self._test_runtime_with_model(model) + def test_LSTM_runtime_with_mask(self): + # Masking will affect which backend is selected based on whether the mask + # is strictly right padded. + layer = rnn.LSTM(self.rnn_state_size, return_runtime=True) + + inputs = keras.layers.Input( + shape=[self.timestep, self.input_shape], dtype=dtypes.float32) + masked_inputs = keras.layers.Masking()(inputs) + + outputs, runtime = layer(masked_inputs) + # Expand the runtime so that it is a 1D tensor instead of scalar. + # TF model does not work with scalar model output, specially during + # aggregation. + runtime = keras.layers.Lambda( + lambda x: array_ops.expand_dims(x, axis=-1))(runtime) + model = keras.models.Model(inputs=inputs, outputs=[outputs, runtime]) + + (x_train, y_train), _ = testing_utils.get_test_data( + train_samples=self.batch, + test_samples=0, + input_shape=(self.timestep, self.input_shape), + num_classes=self.output_shape) + y_train = keras.utils.to_categorical(y_train, self.output_shape) + + model.compile(optimizer='sgd', + loss=['categorical_crossentropy', None], + run_eagerly=testing_utils.should_run_eagerly(), + run_distributed=testing_utils.should_run_distributed()) + + model.fit(x_train, y_train) + + # Verify unpadded data. + _, runtime_value = model.predict(x_train) + if test.is_gpu_available(): + self.assertEqual(runtime_value[0], rnn._RUNTIME_GPU) + else: + self.assertEqual(runtime_value[0], rnn._RUNTIME_CPU) + + # Update x/y to be right padded by setting the last timestep to 0 + x_train[:, -1, :] = 0 + y_train[:, -1] = 0 + _, runtime_value = model.predict(x_train) + if test.is_gpu_available(): + self.assertEqual(runtime_value[0], rnn._RUNTIME_GPU) + else: + self.assertEqual(runtime_value[0], rnn._RUNTIME_CPU) + + # Further update x/y to be mix padded (masks in the middle), and verify + # only cpu kernel can be selected. + x_train[:, -3, :] = 0 + y_train[:, -3] = 0 + _, runtime_value = model.predict(x_train) + self.assertEqual(runtime_value[0], rnn._RUNTIME_CPU) + # Due to b/120160788. @test_util.run_v2_only def test_LSTM_runtime_with_cond(self): diff --git a/tensorflow/python/keras/layers/recurrent_v2.py b/tensorflow/python/keras/layers/recurrent_v2.py index 8225a621b10..217403aa641 100644 --- a/tensorflow/python/keras/layers/recurrent_v2.py +++ b/tensorflow/python/keras/layers/recurrent_v2.py @@ -399,22 +399,8 @@ class GRU(recurrent.DropoutRNNCellMixin, recurrent.GRU): else: last_output, outputs, new_h, runtime = standard_gru(**normal_gru_kwargs) else: - if mask is None: - last_output, outputs, new_h, runtime = gru_with_backend_selection( - normal_gru_kwargs, cudnn_gru_kwargs) - else: - def with_mask_support(): - # TODO(b/134702514): Change to use backend selection. 
- # return gru_with_backend_selection(normal_gru_kwargs, - # cudnn_gru_kwargs) - return standard_gru(**normal_gru_kwargs) - def without_mask_support(): - return standard_gru(**normal_gru_kwargs) - - last_output, outputs, new_h, runtime = control_flow_ops.cond( - is_sequence_right_padded(mask, self.time_major), - true_fn=with_mask_support, - false_fn=without_mask_support) + last_output, outputs, new_h, runtime = gru_with_backend_selection( + **normal_gru_kwargs) states = [new_h] return last_output, outputs, runtime, states @@ -568,7 +554,9 @@ def cudnn_gru(inputs, init_h, kernel, recurrent_kernel, bias, mask, time_major, return last_output, outputs, h, _runtime(_RUNTIME_GPU) -def gru_with_backend_selection(normal_gru_params, cudnn_gru_params): +def gru_with_backend_selection( + inputs, init_h, kernel, recurrent_kernel, bias, mask, time_major, + go_backwards, activation, recurrent_activation): """Call the GRU with optimized backend kernel selection. Under the hood, this function will create two TF function, one with the most @@ -581,12 +569,69 @@ def gru_with_backend_selection(normal_gru_params, cudnn_gru_params): device placement. Args: - normal_gru_params: Dict, parameters for the generic TF function. - cudnn_gru_params: Dict, parameters for the CuDNN specific TF function. + inputs: Input tensor of GRU layer. + init_h: Initial state tensor for the cell output. + kernel: Weights for cell kernel. + recurrent_kernel: Weights for cell recurrent kernel. + bias: Weights for cell kernel bias and recurrent bias. Only recurrent bias + is used in this case. + mask: Boolean tensor for mask out the steps within sequence. + time_major: Boolean, whether the inputs are in the format of + [time, batch, feature] or [batch, time, feature]. + go_backwards: Boolean (default False). If True, process the input sequence + backwards and return the reversed sequence. + activation: Activation function to use for output. + recurrent_activation: Activation function to use for hidden recurrent state. Returns: List of output tensors, same as standard_gru. """ + params = { + 'inputs': inputs, + 'init_h': init_h, + 'kernel': kernel, + 'recurrent_kernel': recurrent_kernel, + 'bias': bias, + 'mask': mask, + 'time_major': time_major, + 'go_backwards': go_backwards, + 'activation': activation, + 'recurrent_activation': recurrent_activation + } + + def cudnn_gru_with_fallback(inputs, init_h, kernel, recurrent_kernel, + bias, mask, time_major, go_backwards, activation, + recurrent_activation): + """Use CuDNN kernel when mask is none or strictly right padded.""" + if mask is None: + return cudnn_gru(inputs=inputs, init_h=init_h, kernel=kernel, + recurrent_kernel=recurrent_kernel, bias=bias, mask=mask, + time_major=time_major, go_backwards=go_backwards) + # Note that mask is a boolean tensor, which doesn't need to do gradient + # calculation, when using tf.cond, a default gradient is added for it, + # which then cause the backward function to have a signature mismatch. + # Force the mask to not generate gradient to allow implementation_selector + # to work properly. + # TODO(b/80444525): Remove the stop_gradient(). 
+ mask = array_ops.stop_gradient(mask) + + def input_right_padded(): + return cudnn_gru(inputs=inputs, init_h=init_h, kernel=kernel, + recurrent_kernel=recurrent_kernel, bias=bias, mask=mask, + time_major=time_major, go_backwards=go_backwards) + + def input_not_right_padded(): + return standard_gru(inputs=inputs, init_h=init_h, kernel=kernel, + recurrent_kernel=recurrent_kernel, bias=bias, + mask=mask, time_major=time_major, + go_backwards=go_backwards, activation=activation, + recurrent_activation=recurrent_activation) + + return control_flow_ops.cond( + is_sequence_right_padded(mask, time_major), + true_fn=input_right_padded, + false_fn=input_not_right_padded) + # Each time a `tf.function` is called, we will give it a unique # identifiable API name, so that Grappler won't get confused when it # sees multiple GRU layers added into same graph, and it will be able @@ -595,14 +640,12 @@ def gru_with_backend_selection(normal_gru_params, cudnn_gru_params): defun_standard_gru = _generate_defun_backend( api_name, _CPU_DEVICE_NAME, standard_gru) defun_cudnn_gru = _generate_defun_backend( - api_name, _GPU_DEVICE_NAME, cudnn_gru) + api_name, _GPU_DEVICE_NAME, cudnn_gru_with_fallback) # Call the normal GRU impl and register the CuDNN impl function. The # grappler will kick in during session execution to optimize the graph. - last_output, outputs, new_h, runtime = defun_standard_gru( - **normal_gru_params) - - function.register(defun_cudnn_gru, **cudnn_gru_params) + last_output, outputs, new_h, runtime = defun_standard_gru(**params) + function.register(defun_cudnn_gru, **params) return last_output, outputs, new_h, runtime @@ -919,24 +962,8 @@ class LSTM(recurrent.DropoutRNNCellMixin, recurrent.LSTM): last_output, outputs, new_h, new_c, runtime = standard_lstm( **normal_lstm_kwargs) else: - if mask is None: - (last_output, outputs, - new_h, new_c, runtime) = lstm_with_backend_selection( - normal_lstm_kwargs, cudnn_lstm_kwargs) - else: - def with_mask_support(): - # TODO(b/134702514): Change to use backend selection. - # return lstm_with_backend_selection(normal_lstm_kwargs, - # cudnn_lstm_kwargs) - return standard_lstm(**normal_lstm_kwargs) - def without_mask_support(): - return standard_lstm(**normal_lstm_kwargs) - - (last_output, outputs, - new_h, new_c, runtime) = control_flow_ops.cond( - is_sequence_right_padded(mask, self.time_major), - true_fn=with_mask_support, - false_fn=without_mask_support) + (last_output, outputs, new_h, new_c, + runtime) = lstm_with_backend_selection(**normal_lstm_kwargs) states = [new_h, new_c] @@ -1162,7 +1189,9 @@ def cudnn_lstm(inputs, init_h, init_c, kernel, recurrent_kernel, bias, mask, return last_output, outputs, h, c, _runtime(_RUNTIME_GPU) -def lstm_with_backend_selection(normal_lstm_params, cudnn_lstm_params): +def lstm_with_backend_selection( + inputs, init_h, init_c, kernel, recurrent_kernel, bias, mask, time_major, + go_backwards, activation, recurrent_activation): """Call the LSTM with optimized backend kernel selection. Under the hood, this function will create two TF function, one with the most @@ -1175,12 +1204,73 @@ def lstm_with_backend_selection(normal_lstm_params, cudnn_lstm_params): device placement. Args: - normal_lstm_params: Dict, parameters for the generic TF function. - cudnn_lstm_params: Dict, parameters for the CuDNN specific TF function. + inputs: Input tensor of LSTM layer. + init_h: Initial state tensor for the cell output. + init_c: Initial state tensor for the cell hidden state. + kernel: Weights for cell kernel. 
+ recurrent_kernel: Weights for cell recurrent kernel. + bias: Weights for cell kernel bias and recurrent bias. Only recurrent bias + is used in this case. + mask: Boolean tensor for mask out the steps within sequence. + time_major: Boolean, whether the inputs are in the format of + [time, batch, feature] or [batch, time, feature]. + go_backwards: Boolean (default False). If True, process the input sequence + backwards and return the reversed sequence. + activation: Activation function to use for output. + recurrent_activation: Activation function to use for hidden recurrent state. Returns: List of output tensors, same as standard_lstm. """ + params = { + 'inputs': inputs, + 'init_h': init_h, + 'init_c': init_c, + 'kernel': kernel, + 'recurrent_kernel': recurrent_kernel, + 'bias': bias, + 'mask': mask, + 'time_major': time_major, + 'go_backwards': go_backwards, + 'activation': activation, + 'recurrent_activation': recurrent_activation + } + + def cudnn_lstm_with_fallback(inputs, init_h, init_c, kernel, recurrent_kernel, + bias, mask, time_major, go_backwards, activation, + recurrent_activation): + """Use CuDNN kernel when mask is none or strictly right padded.""" + if mask is None: + return cudnn_lstm(inputs=inputs, init_h=init_h, init_c=init_c, + kernel=kernel, recurrent_kernel=recurrent_kernel, + bias=bias, mask=mask, time_major=time_major, + go_backwards=go_backwards) + # Note that mask is a boolean tensor, which doesn't need to do gradient + # calculation, when using tf.cond, a default gradient is added for it, + # which then cause the backward function to have a signature mismatch. + # Force the mask to not generate gradient to allow implementation_selector + # to work properly. + # TODO(b/80444525): Remove the stop_gradient(). + mask = array_ops.stop_gradient(mask) + + def input_right_padded(): + return cudnn_lstm(inputs=inputs, init_h=init_h, init_c=init_c, + kernel=kernel, recurrent_kernel=recurrent_kernel, + bias=bias, mask=mask, time_major=time_major, + go_backwards=go_backwards) + + def input_not_right_padded(): + return standard_lstm(inputs=inputs, init_h=init_h, init_c=init_c, + kernel=kernel, recurrent_kernel=recurrent_kernel, + bias=bias, mask=mask, time_major=time_major, + go_backwards=go_backwards, activation=activation, + recurrent_activation=recurrent_activation) + + return control_flow_ops.cond( + is_sequence_right_padded(mask, time_major), + true_fn=input_right_padded, + false_fn=input_not_right_padded) + # Each time a `tf.function` is called, we will give it a unique # identifiable API name, so that Grappler won't get confused when it # sees multiple LSTM layers added into same graph, and it will be able @@ -1189,14 +1279,14 @@ def lstm_with_backend_selection(normal_lstm_params, cudnn_lstm_params): defun_standard_lstm = _generate_defun_backend( api_name, _CPU_DEVICE_NAME, standard_lstm) defun_cudnn_lstm = _generate_defun_backend( - api_name, _GPU_DEVICE_NAME, cudnn_lstm) + api_name, _GPU_DEVICE_NAME, cudnn_lstm_with_fallback) # Call the normal LSTM impl and register the CuDNN impl function. The # grappler will kick in during session execution to optimize the graph. 
last_output, outputs, new_h, new_c, runtime = defun_standard_lstm( - **normal_lstm_params) + **params) + function.register(defun_cudnn_lstm, **params) - function.register(defun_cudnn_lstm, **cudnn_lstm_params) return last_output, outputs, new_h, new_c, runtime @@ -1264,7 +1354,8 @@ def _generate_defun_backend(unique_api_name, preferred_device, func): _DEFUN_DEVICE_ATTRIBUTE: preferred_device, } return function.defun_with_attributes(func=func, - attributes=function_attributes) + attributes=function_attributes, + autograph=False) def _get_context_device_type(): From 8dc62ccf8218dd2d6d8e1757ef63e7c360d35b4d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 23 Jul 2019 14:11:01 -0700 Subject: [PATCH 0405/3053] Autograph: Fix chained function conversion Chained functions were not correctly converted. For example, `foo().bar().baz()` only converted baz. Now fixed. PiperOrigin-RevId: 259608163 --- .../autograph/converters/asserts_test.py | 2 +- .../converters/break_statements_test.py | 4 +- .../python/autograph/converters/call_trees.py | 12 +-- .../autograph/converters/call_trees_test.py | 86 ++++++++++--------- .../converters/continue_statements_test.py | 2 +- .../autograph/converters/control_flow_test.py | 2 +- .../converters/function_scopes_test.py | 7 +- .../python/autograph/converters/lists_test.py | 4 +- .../converters/side_effect_guards_test.py | 14 +-- .../autograph/converters/slices_test.py | 2 +- .../autograph/core/converter_testing.py | 15 ++-- 11 files changed, 79 insertions(+), 71 deletions(-) diff --git a/tensorflow/python/autograph/converters/asserts_test.py b/tensorflow/python/autograph/converters/asserts_test.py index 9ae448892a0..061b63f9d10 100644 --- a/tensorflow/python/autograph/converters/asserts_test.py +++ b/tensorflow/python/autograph/converters/asserts_test.py @@ -38,7 +38,7 @@ class AssertsTest(converter_testing.TestCase): return tf.no_op() # pylint:disable=undefined-variable with self.converted(test_fn, (asserts, side_effect_guards), {}, - gen_control_flow_ops.no_op) as result: + (gen_control_flow_ops.no_op,)) as result: with self.cached_session() as sess: op = result.test_fn(constant_op.constant(False)) with self.assertRaisesRegexp(errors_impl.InvalidArgumentError, diff --git a/tensorflow/python/autograph/converters/break_statements_test.py b/tensorflow/python/autograph/converters/break_statements_test.py index 816d3bb1b65..c789ced095d 100644 --- a/tensorflow/python/autograph/converters/break_statements_test.py +++ b/tensorflow/python/autograph/converters/break_statements_test.py @@ -28,7 +28,7 @@ class BreakCanonicalizationTest(converter_testing.TestCase): def assertTransformedEquivalent(self, test_fn, *inputs): with self.converted(test_fn, break_statements, {}, - constant_op.constant) as result: + (constant_op.constant,)) as result: self.assertEqual(test_fn(*inputs), result.test_fn(*inputs)) def test_while_loop(self): @@ -58,7 +58,7 @@ class BreakCanonicalizationTest(converter_testing.TestCase): return v with self.converted(test_fn, break_statements, {}, - constant_op.constant) as result: + (constant_op.constant,)) as result: # The break is incompletely canonicalized. The loop will not interrupt, # but the section following the break will be skipped. 
self.assertEqual([3], result.test_fn([5, 4])) diff --git a/tensorflow/python/autograph/converters/call_trees.py b/tensorflow/python/autograph/converters/call_trees.py index 657d880620f..52e6af52b6f 100644 --- a/tensorflow/python/autograph/converters/call_trees.py +++ b/tensorflow/python/autograph/converters/call_trees.py @@ -71,24 +71,26 @@ class CallTreeTransformer(converter.Base): return node def visit_Call(self, node): + full_name = str(anno.getanno(node.func, anno.Basic.QN, default='')) + node = self.generic_visit(node) + # TODO(mdan): Refactor converted_call as a 'Call' operator. # Calls to the internal 'ag__' module are never converted (though their # arguments might be). - full_name = str(anno.getanno(node.func, anno.Basic.QN, default='')) if full_name.startswith('ag__.'): - return self.generic_visit(node) + return node # Calls to pdb.set_trace or ipdb.set_trace are never converted. We don't use # the normal mechanisms to bypass these literals because they are sensitive # to the frame they are being called from. # TODO(mdan): Generalize this to a "static whitelist" config. if full_name in ('pdb.set_trace', 'ipdb.set_trace'): - return self.generic_visit(node) + return node if (full_name == 'print' and not self.ctx.program.options.uses(converter.Feature.BUILTIN_FUNCTIONS)): - return self.generic_visit(node) + return node func = node.func @@ -99,7 +101,6 @@ class CallTreeTransformer(converter.Base): assert starred_arg is None, 'Multiple *args should be impossible.' starred_arg = a else: - a = self.visit(a) normal_args.append(a) if starred_arg is None: args = templates.replace_as_expression('(args,)', args=normal_args) @@ -116,7 +117,6 @@ class CallTreeTransformer(converter.Base): assert kwargs_arg is None, 'Multiple **kwargs should be impossible.' 
kwargs_arg = k else: - k = self.visit(k) normal_keywords.append(k) if kwargs_arg is None: if not normal_keywords: diff --git a/tensorflow/python/autograph/converters/call_trees_test.py b/tensorflow/python/autograph/converters/call_trees_test.py index d61908fc8e8..b77248b8711 100644 --- a/tensorflow/python/autograph/converters/call_trees_test.py +++ b/tensorflow/python/autograph/converters/call_trees_test.py @@ -30,52 +30,62 @@ class CallTreesTest(converter_testing.TestCase): def test_normal_function(self): def test_fn(f): - return f() + 3 + return f() + 20 with self.converted(test_fn, call_trees, {}) as result: - self.assertEqual( - result.test_fn(None), - converter_testing.RESULT_OF_MOCK_CONVERTED_CALL + 3) + self.assertEqual(result.test_fn(lambda: 1), 21) self.assertListEqual(self.dynamic_calls, [((), None)]) def test_function_with_expression_in_argument(self): def test_fn(f, g): - return f(g() + 7) + 3 + return f(g() + 20) + 4000 with self.converted(test_fn, call_trees, {}) as result: - self.assertEqual( - result.test_fn(None, None), - converter_testing.RESULT_OF_MOCK_CONVERTED_CALL + 3) + self.assertEqual(result.test_fn(lambda x: x + 300, lambda: 1), 4321) self.assertListEqual(self.dynamic_calls, [ ((), None), - ((converter_testing.RESULT_OF_MOCK_CONVERTED_CALL + 7,), None), + ((21,), None), ]) def test_function_with_call_in_argument(self): def test_fn(f, g): - return f(g()) + 3 + return f(g()) + 300 with self.converted(test_fn, call_trees, {}) as result: - self.assertEqual( - result.test_fn(None, None), - converter_testing.RESULT_OF_MOCK_CONVERTED_CALL + 3) + self.assertEqual(result.test_fn(lambda x: x + 20, lambda: 1), 321) self.assertListEqual(self.dynamic_calls, [ ((), None), - ((converter_testing.RESULT_OF_MOCK_CONVERTED_CALL,), None), + ((1,), None), + ]) + + def test_function_chaining(self): + + def get_one(): + return 1 + + def test_fn(): + return get_one().__add__(20) + + with self.converted(test_fn, call_trees, {'get_one': get_one}, + ()) as result: + + self.assertEqual(result.test_fn(), 21) + + self.assertListEqual(self.dynamic_calls, [ + ((), None), + ((20,), None), ]) def test_function_with_kwarg(self): def test_fn(f, a, b): - return f(a, c=b) + 3 + return f(a, c=b) + 300 with self.converted(test_fn, call_trees, {}) as result: - self.assertEqual( - result.test_fn(None, 1, 2), - converter_testing.RESULT_OF_MOCK_CONVERTED_CALL + 3) - self.assertListEqual(self.dynamic_calls, [((1,), {'c': 2})]) + self.assertEqual(result.test_fn(lambda a, c: a + c, 1, 20), 321) + self.assertListEqual(self.dynamic_calls, [((1,), {'c': 20})]) def test_function_with_kwargs_starargs(self): @@ -84,25 +94,24 @@ class CallTreesTest(converter_testing.TestCase): with self.converted(test_fn, call_trees, {}) as result: self.assertEqual( - result.test_fn(None, 1, *[2, 3], **{ + result.test_fn(lambda *args, **kwargs: 7, 1, *[2, 3], **{ 'b': 4, 'c': 5 - }), converter_testing.RESULT_OF_MOCK_CONVERTED_CALL + 5) + }), 12) self.assertListEqual(self.dynamic_calls, [((1, 2, 3), {'b': 4, 'c': 5})]) def test_function_with_kwargs_starargs_only(self): - def f(*unused_args): # Will not be called. 
- pass + def f(*args): + return sum(args) def test_fn(): - args = [1, 2, 3] - return f(*args) + 11 + args = [1, 20, 300] + return f(*args) + 4000 with self.converted(test_fn, call_trees, {'f': f}) as result: - self.assertEqual(result.test_fn(), - converter_testing.RESULT_OF_MOCK_CONVERTED_CALL + 11) - self.assertListEqual(self.dynamic_calls, [((1, 2, 3), None)]) + self.assertEqual(result.test_fn(), 4321) + self.assertListEqual(self.dynamic_calls, [((1, 20, 300), None)]) def test_function_with_kwargs_keywords(self): @@ -111,8 +120,7 @@ class CallTreesTest(converter_testing.TestCase): with self.converted(test_fn, call_trees, {}) as result: self.assertEqual( - result.test_fn(None, 1, 2, **{'c': 3}), - converter_testing.RESULT_OF_MOCK_CONVERTED_CALL + 5) + result.test_fn(lambda *args, **kwargs: 7, 1, 2, **{'c': 3}), 12) self.assertListEqual(self.dynamic_calls, [((1,), {'b': 2, 'c': 3})]) def test_debugger_set_trace(self): @@ -133,32 +141,30 @@ class CallTreesTest(converter_testing.TestCase): class TestClass(object): - def other_method(self, _): - raise ValueError('this should not be called') + def other_method(self, x): + return x + 20 def test_method(self, a): - return self.other_method(a) + 1 + return self.other_method(a) + 300 tc = TestClass() with self.converted(TestClass.test_method, call_trees, {}) as result: - self.assertEqual(converter_testing.RESULT_OF_MOCK_CONVERTED_CALL + 1, - result.test_method(tc, 1)) + self.assertEqual(321, result.test_method(tc, 1)) self.assertListEqual(self.dynamic_calls, [((1,), None)]) def test_object_method(self): class TestClass(object): - def other_method(self, _): - raise ValueError('this should not be called') + def other_method(self, x): + return x + 20 def test_method(self, a): - return self.other_method(a) + 1 + return self.other_method(a) + 300 tc = TestClass() with self.converted(tc.test_method, call_trees, {}) as result: - self.assertEqual(converter_testing.RESULT_OF_MOCK_CONVERTED_CALL + 1, - result.test_method(tc, 1)) + self.assertEqual(321, result.test_method(tc, 1)) self.assertListEqual(self.dynamic_calls, [((1,), None)]) diff --git a/tensorflow/python/autograph/converters/continue_statements_test.py b/tensorflow/python/autograph/converters/continue_statements_test.py index 97a975b1698..a24ddd5e527 100644 --- a/tensorflow/python/autograph/converters/continue_statements_test.py +++ b/tensorflow/python/autograph/converters/continue_statements_test.py @@ -29,7 +29,7 @@ class ContinueCanonicalizationTest(converter_testing.TestCase): def assertTransformedEquivalent(self, test_fn, *inputs): with self.converted(test_fn, continue_statements, {'ops': ops}, - constant_op.constant) as result: + (constant_op.constant,)) as result: self.assertEqual(test_fn(*inputs), result.test_fn(*inputs)) def test_basic(self): diff --git a/tensorflow/python/autograph/converters/control_flow_test.py b/tensorflow/python/autograph/converters/control_flow_test.py index 4690b114a77..e1ba82043bc 100644 --- a/tensorflow/python/autograph/converters/control_flow_test.py +++ b/tensorflow/python/autograph/converters/control_flow_test.py @@ -39,7 +39,7 @@ class ControlFlowTest(converter_testing.TestCase): if not symbols: symbols = {} with self.converted(test_fn, control_flow, symbols, - constant_op.constant) as result: + (constant_op.constant,)) as result: self.assertAllEqual(self.evaluate(result.test_fn(*inputs)), expected) @test_util.run_deprecated_v1 diff --git a/tensorflow/python/autograph/converters/function_scopes_test.py 
b/tensorflow/python/autograph/converters/function_scopes_test.py index 0eccf39db7d..f973687e8bb 100644 --- a/tensorflow/python/autograph/converters/function_scopes_test.py +++ b/tensorflow/python/autograph/converters/function_scopes_test.py @@ -55,7 +55,7 @@ class FunctionBodyTransformerTest(converter_testing.TestCase): return tf.constant(1) with self.converted(test_fn, function_scopes, {}, - constant_op.constant) as result: + (constant_op.constant,)) as result: result_op = result.test_fn() self.assertIn('test_fn/', result_op.op.name) self.assertIn('First sentence.', result.test_fn.__doc__) @@ -72,7 +72,8 @@ class FunctionBodyTransformerTest(converter_testing.TestCase): l += 1 return l, inner_fn(l) - with self.converted(test_fn, function_scopes, {}, ops.name_scope) as result: + with self.converted(test_fn, function_scopes, {}, + (ops.name_scope,)) as result: first, second = result.test_fn(constant_op.constant(1)) self.assertIn('test_fn/', first.op.name) self.assertNotIn('inner_fn', first.op.name) @@ -95,7 +96,7 @@ class FunctionBodyTransformerTest(converter_testing.TestCase): node, ctx = self.prepare(TestClass, ns) node = function_scopes.transform(node, ctx) - with self.compiled(node, {}, ops.name_scope) as result: + with self.compiled(node, {}, (ops.name_scope,)) as result: first, second = result.TestClass().test_fn(constant_op.constant(1)) self.assertIn('TestClass/test_fn/', first.op.name) self.assertNotIn('inner_fn', first.op.name) diff --git a/tensorflow/python/autograph/converters/lists_test.py b/tensorflow/python/autograph/converters/lists_test.py index 39843c7d74f..9436b69d749 100644 --- a/tensorflow/python/autograph/converters/lists_test.py +++ b/tensorflow/python/autograph/converters/lists_test.py @@ -87,7 +87,7 @@ class ListTest(converter_testing.TestCase): } node = lists.transform(node, ctx) - with self.compiled(node, ns, dtypes.int32) as result: + with self.compiled(node, ns, (dtypes.int32,)) as result: with self.cached_session() as sess: ts, tl = result.test_fn() r = list_ops.tensor_list_stack(tl, dtypes.int32) @@ -121,7 +121,7 @@ class ListTest(converter_testing.TestCase): } node = lists.transform(node, ctx) - with self.compiled(node, {}, array_ops.stack, dtypes.int32) as result: + with self.compiled(node, {}, (array_ops.stack, dtypes.int32)) as result: with self.cached_session() as sess: self.assertAllEqual(self.evaluate(result.test_fn()), [1, 2, 3]) diff --git a/tensorflow/python/autograph/converters/side_effect_guards_test.py b/tensorflow/python/autograph/converters/side_effect_guards_test.py index 645267e5600..ead05d041aa 100644 --- a/tensorflow/python/autograph/converters/side_effect_guards_test.py +++ b/tensorflow/python/autograph/converters/side_effect_guards_test.py @@ -47,7 +47,7 @@ class SideEffectGuardsTest(converter_testing.TestCase): self.assertEqual(len(node.body), 1) - with self.compiled(node, {}, state_ops.assign) as result: + with self.compiled(node, {}, (state_ops.assign,)) as result: with self.cached_session() as sess: v = variable_scope.get_variable('test', initializer=2) self.evaluate(v.initializer) @@ -68,7 +68,7 @@ class SideEffectGuardsTest(converter_testing.TestCase): self.assertEqual(len(node.body), 1) - with self.compiled(node, {}, state_ops.assign) as result: + with self.compiled(node, {}, (state_ops.assign,)) as result: with self.cached_session() as sess: v = variable_scope.get_variable('test', initializer=2) self.evaluate(v.initializer) @@ -89,7 +89,7 @@ class SideEffectGuardsTest(converter_testing.TestCase): self.assertEqual(len(node.body), 1) - 
with self.compiled(node, {}, control_flow_ops.Assert) as result: + with self.compiled(node, {}, (control_flow_ops.Assert,)) as result: with self.cached_session() as sess: with self.assertRaisesRegexp(errors_impl.InvalidArgumentError, 'expected in throw'): @@ -109,7 +109,7 @@ class SideEffectGuardsTest(converter_testing.TestCase): self.assertEqual(len(node.body), 1) - with self.compiled(node, {}, state_ops.assign_add) as result: + with self.compiled(node, {}, (state_ops.assign_add,)) as result: with self.cached_session() as sess: v = variable_scope.get_variable('test', initializer=2) self.evaluate(v.initializer) @@ -130,7 +130,7 @@ class SideEffectGuardsTest(converter_testing.TestCase): self.assertEqual(len(node.body[0].body), 1) - with self.compiled(node, {}, state_ops.assign, ops.name_scope) as result: + with self.compiled(node, {}, (state_ops.assign, ops.name_scope)) as result: with self.cached_session() as sess: v = variable_scope.get_variable('test', initializer=2) self.evaluate(v.initializer) @@ -152,8 +152,8 @@ class SideEffectGuardsTest(converter_testing.TestCase): self.assertEqual(len(node.body), 1) - with self.compiled(node, {}, state_ops.assign, - state_ops.assign_add) as result: + with self.compiled(node, {}, + (state_ops.assign, state_ops.assign_add)) as result: with self.cached_session() as sess: v = variable_scope.get_variable('test', initializer=2) self.evaluate(v.initializer) diff --git a/tensorflow/python/autograph/converters/slices_test.py b/tensorflow/python/autograph/converters/slices_test.py index 11e3736d4fb..2fea1c7f81f 100644 --- a/tensorflow/python/autograph/converters/slices_test.py +++ b/tensorflow/python/autograph/converters/slices_test.py @@ -43,7 +43,7 @@ class SliceTest(converter_testing.TestCase): } node = slices.transform(node, ctx) - with self.compiled(node, {}, dtypes.int32) as result: + with self.compiled(node, {}, (dtypes.int32,)) as result: with self.cached_session() as sess: tl = list_ops.tensor_list_from_tensor( [1, 2], element_shape=constant_op.constant([], dtype=dtypes.int32)) diff --git a/tensorflow/python/autograph/core/converter_testing.py b/tensorflow/python/autograph/core/converter_testing.py index bb2ed38fbbb..507739fdbc2 100644 --- a/tensorflow/python/autograph/core/converter_testing.py +++ b/tensorflow/python/autograph/core/converter_testing.py @@ -37,8 +37,6 @@ from tensorflow.python.autograph.pyct import pretty_printer from tensorflow.python.autograph.pyct import transformer from tensorflow.python.platform import test -RESULT_OF_MOCK_CONVERTED_CALL = 7 - class TestCase(test.TestCase): """Base class for unit tests in this module. 
Contains relevant utilities.""" @@ -54,15 +52,17 @@ class TestCase(test.TestCase): sys.stdout = sys.__stdout__ @contextlib.contextmanager - def compiled(self, node, namespace, *symbols): + def compiled(self, node, namespace, symbols=()): source = None self.dynamic_calls = [] # See api.converted_call - def converted_call(unused_f, unused_opts, args, kwargs): + def converted_call(f, unused_opts, args, kwargs): """Mock version of api.converted_call.""" self.dynamic_calls.append((args, kwargs)) - return RESULT_OF_MOCK_CONVERTED_CALL + if kwargs is None: + kwargs = {} + return f(*args, **kwargs) try: result, source, source_map = compiler.ast_to_object( @@ -92,7 +92,8 @@ class TestCase(test.TestCase): raise @contextlib.contextmanager - def converted(self, entity, converter_module, namespace, *tf_symbols): + def converted(self, entity, converter_module, namespace, tf_symbols=()): + node, ctx = self.prepare(entity, namespace) if not isinstance(converter_module, (list, tuple)): @@ -101,7 +102,7 @@ class TestCase(test.TestCase): node = converter.standard_analysis(node, ctx, is_initial=not i) node = m.transform(node, ctx) - with self.compiled(node, namespace, *tf_symbols) as result: + with self.compiled(node, namespace, tf_symbols) as result: yield result def make_fake_mod(self, name, *symbols): From 2bf0af74aeea0c6aa60dca0372e2c0289cf067f8 Mon Sep 17 00:00:00 2001 From: Yuanzhong Xu Date: Tue, 23 Jul 2019 14:16:43 -0700 Subject: [PATCH 0406/3053] [XLA] BF16 propagation: fix bitcast-convert fusion root handling Although we try to skip changing bitcast-convert, but when it is a fusion root, we still change it to match the fusion output. In this case, we now add a convert after the bitcast-convert, instead of changing the shape in-place. This way we still get the benefit of reduced memory write in the fusion. PiperOrigin-RevId: 259609312 --- .../xla/service/bfloat16_propagation.cc | 65 ++++++++++++++++--- .../xla/service/bfloat16_propagation_test.cc | 29 +++++++++ 2 files changed, 85 insertions(+), 9 deletions(-) diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation.cc b/tensorflow/compiler/xla/service/bfloat16_propagation.cc index 4d465640f2d..6331f02aa81 100644 --- a/tensorflow/compiler/xla/service/bfloat16_propagation.cc +++ b/tensorflow/compiler/xla/service/bfloat16_propagation.cc @@ -308,6 +308,28 @@ bool BFloat16Propagation::AllUsersConsumeBF16(const HloInstruction& hlo, return true; } +namespace { + +// Returns whether we should avoid changing the precision of inst regardless of +// the producers and users. +bool ShouldKeepPrecisionUnchanged(const HloInstruction* inst) { + if (inst->opcode() == HloOpcode::kFusion && + inst->fusion_kind() == HloInstruction::FusionKind::kCustom) { + return ShouldKeepPrecisionUnchanged( + inst->fused_instructions_computation()->root_instruction()); + } + // Do not change precision for side-effecting instructions, control flow, and + // bitcast-convert, because this pass might break the interfaces or + // assumptions for them. 
+ return inst->opcode() == HloOpcode::kCustomCall || // + inst->opcode() == HloOpcode::kCall || // + inst->opcode() == HloOpcode::kConditional || // + inst->opcode() == HloOpcode::kBitcastConvert || // + inst->HasSideEffectNoRecurse(); +} + +} // namespace + void BFloat16Propagation::DetermineInstructionPrecision(HloInstruction* hlo, bool skip_parameters) { // We handle any fusion computation or while body/condition after the @@ -354,15 +376,7 @@ void BFloat16Propagation::DetermineInstructionPrecision(HloInstruction* hlo, return; } - // Do not change precision for instructions related to entry and exit of a - // computation, side-effecting instructions, control flow, and - // bitcast-convert, because this pass might break the interfaces or - // assumptions for them. - if (hlo->opcode() == HloOpcode::kCustomCall || // - hlo->opcode() == HloOpcode::kCall || // - hlo->opcode() == HloOpcode::kConditional || // - hlo->opcode() == HloOpcode::kBitcastConvert || // - hlo->HasSideEffectNoRecurse() || // + if (ShouldKeepPrecisionUnchanged(hlo) || (hlo->opcode() == HloOpcode::kParameter && skip_parameters)) { return; } @@ -797,6 +811,39 @@ StatusOr BFloat16Propagation::Run(HloModule* module) { // Apply the changes in changes_to_bf16_. for (auto& change : changes_to_bf16_) { + auto inst = change.first; + // It is possible that we marked inst to change precision even if it is an + // unsupported change, when inst is the root of a fusion computation and it + // has to match the fusion node's output precision. We do a convert instead + // of in-place change for such cases. + if (ShouldKeepPrecisionUnchanged(inst)) { + auto users = inst->users(); + bool is_root = inst == inst->parent()->root_instruction(); + TF_ASSIGN_OR_RETURN( + HloInstruction * copy, + inst->parent()->DeepCopyInstructionWithCustomCopier( + inst, [&](HloInstruction* leaf, const ShapeIndex& leaf_index, + HloComputation* comp) { + if (!ContainsKey(change.second, + ShapeUtil::GetMutableSubshape( + inst->mutable_shape(), leaf_index))) { + return leaf; + } + auto converted_shape = + ShapeUtil::ChangeElementType(leaf->shape(), BF16); + UpdateLayout(&converted_shape); + return comp->AddInstruction( + HloInstruction::CreateConvert(converted_shape, leaf)); + })); + for (auto user : users) { + TF_RETURN_IF_ERROR(inst->ReplaceUseWithDifferentShape(user, copy)); + } + if (is_root) { + inst->parent()->set_root_instruction(copy, + /*accept_different_shape=*/true); + } + continue; + } for (const auto& entry : change.second) { auto subshape = entry.first; CHECK_EQ(subshape->element_type(), F32); diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc index 86eb8cb240c..d716e62d467 100644 --- a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc +++ b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc @@ -422,6 +422,35 @@ TEST_F(BFloat16PropagationTest, PropagateThroughFusion) { EXPECT_TRUE(OutputsBF16(b_f1)); } +// Tests that a fusion with a bitcast-convert as its root is changed via adding +// extra convert, instead of changing the type in-place. 
+TEST_F(BFloat16PropagationTest, FusionWithBitcastConvertRoot) { + auto module = CreateNewVerifiedModule(); + auto builder = HloComputation::Builder(TestName()); + Shape u32_shape = ShapeUtil::MakeShape(U32, {4, 4}); + Shape f32_shape = ShapeUtil::MakeShape(F32, {4, 4}); + + HloInstruction* param = builder.AddInstruction( + HloInstruction::CreateParameter(0, u32_shape, "param")); + + auto builder_f = HloComputation::Builder("fusion"); + HloInstruction* a_f = builder_f.AddInstruction( + HloInstruction::CreateParameter(0, u32_shape, "a")); + HloInstruction* bc_f = builder_f.AddInstruction( + HloInstruction::CreateBitcastConvert(f32_shape, a_f)); + auto comp_f = module->AddEmbeddedComputation(builder_f.Build()); + auto fusion = builder.AddInstruction(HloInstruction::CreateFusion( + f32_shape, HloInstruction::FusionKind::kLoop, {param}, comp_f)); + auto dot = builder.AddInstruction(CreateDot(f32_shape, fusion, fusion)); + + auto computation = module->AddEntryComputation(builder.Build()); + EXPECT_TRUE(PropagatePrecision(module.get())); + + EXPECT_EQ(computation->root_instruction(), dot); + EXPECT_EQ(bc_f->shape(), f32_shape); + EXPECT_TRUE(OutputsBF16(bc_f)); +} + // Tests that changes to BF16 that cannot be propagated outside a fusion are // discarded. TEST_F(BFloat16PropagationTest, DiscardFusionInternalBF16Changes) { From aa5e1db8e84e86830a78e03096a6bff3e81ed170 Mon Sep 17 00:00:00 2001 From: Karthik Muthuraman Date: Tue, 23 Jul 2019 14:48:59 -0700 Subject: [PATCH 0407/3053] changed exports for reciprocal_no_nan(). --- tensorflow/python/ops/math_ops.py | 2 +- tensorflow/tools/api/golden/v1/tensorflow.pbtxt | 4 ---- tensorflow/tools/api/golden/v2/tensorflow.pbtxt | 4 ---- 3 files changed, 1 insertion(+), 9 deletions(-) diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 807c64c1991..906b4d5fd32 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -4004,7 +4004,7 @@ def polyval(coeffs, x, name=None): p = c + p * x return p -@tf_export("math.reciprocal_no_nan", "reciprocal_no_nan") +@tf_export("math.reciprocal_no_nan") def reciprocal_no_nan(x, name=None): """Performs a safe reciprocal operation, element wise. 
If a particular element is zero, the reciprocal for that element is diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt index 294efc75ed3..178daad4a2a 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt @@ -1856,10 +1856,6 @@ tf_module { name: "reciprocal" argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } - member_method { - name: "reciprocal_no_nan" - argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " - } member_method { name: "recompute_grad" argspec: "args=[\'f\'], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt index a56e7d0dbe9..33c4610d97b 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt @@ -860,10 +860,6 @@ tf_module { name: "realdiv" argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } - member_method { - name: "reciprocal_no_nan" - argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " - } member_method { name: "recompute_grad" argspec: "args=[\'f\'], varargs=None, keywords=None, defaults=None" From dfb2078212f3e3adea2de71c06cbef692c0989c9 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 23 Jul 2019 14:23:22 -0700 Subject: [PATCH 0408/3053] Keras subclass model wrapper for Graph Regularization. PiperOrigin-RevId: 259610650 --- tensorflow/python/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index dbf32d93e71..d5710eec49e 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -12,6 +12,7 @@ visibility = [ "//tensorflow_models:__subpackages__", "//tensorflow_model_optimization:__subpackages__", "//third_party/py/cleverhans:__subpackages__", + "//third_party/py/neural_structured_learning/keras:__pkg__", "//third_party/py/tensorflow_examples:__subpackages__", "//third_party/py/tf_slim:__subpackages__", # TODO(aselle): to pass open source test. From d597af28200a125c683d77596a4850a6a5953293 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 23 Jul 2019 14:36:06 -0700 Subject: [PATCH 0409/3053] Update core quantiles ops to be consistent with TFT API. 
PiperOrigin-RevId: 259613399 --- ...f_BoostedTreesFlushQuantileSummaries.pbtxt | 16 +++++ .../kernels/boosted_trees/quantile_ops.cc | 59 +++++++++++++++++++ tensorflow/core/ops/boosted_trees_ops.cc | 14 +++++ .../boosted_trees/quantile_ops_test.py | 36 +++++++++++ .../api/golden/v1/tensorflow.raw_ops.pbtxt | 4 ++ .../api/golden/v2/tensorflow.raw_ops.pbtxt | 4 ++ 6 files changed, 133 insertions(+) create mode 100644 tensorflow/core/api_def/base_api/api_def_BoostedTreesFlushQuantileSummaries.pbtxt diff --git a/tensorflow/core/api_def/base_api/api_def_BoostedTreesFlushQuantileSummaries.pbtxt b/tensorflow/core/api_def/base_api/api_def_BoostedTreesFlushQuantileSummaries.pbtxt new file mode 100644 index 00000000000..bcd7cc5978d --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_BoostedTreesFlushQuantileSummaries.pbtxt @@ -0,0 +1,16 @@ +op { + graph_op_name: "BoostedTreesFlushQuantileSummaries" + visibility: HIDDEN + in_arg { + name: "quantile_stream_resource_handle" + description: <GetAttr(kNumFeaturesName, &num_features_)); + } + + void Compute(OpKernelContext* const context) override { + ResourceHandle handle; + OP_REQUIRES_OK(context, + HandleFromInput(context, kResourceHandleName, &handle)); + core::RefCountPtr stream_resource; + OP_REQUIRES_OK(context, LookupResource(context, handle, &stream_resource)); + // Remove the reference at the end of this scope. + mutex_lock l(*stream_resource->mutex()); + + OpOutputList summaries_output_list; + OP_REQUIRES_OK( + context, context->output_list(kSummariesName, &summaries_output_list)); + + auto do_quantile_summary_gen = [&](const int64 begin, const int64 end) { + // Iterating features. + for (int64 index = begin; index < end; index++) { + QuantileStream* stream = stream_resource->stream(index); + stream->Finalize(); + + const auto summary_list = stream->GetFinalSummary().GetEntryList(); + Tensor* output_t; + const int64 summary_list_size = static_cast(summary_list.size()); + OP_REQUIRES_OK(context, summaries_output_list.allocate( + index, TensorShape({summary_list_size, 4}), + &output_t)); + auto output = output_t->matrix(); + for (auto row = 0; row < summary_list_size; row++) { + const auto& entry = summary_list[row]; + output(row, 0) = entry.value; + output(row, 1) = entry.weight; + output(row, 2) = entry.min_rank; + output(row, 3) = entry.max_rank; + } + } + }; + // TODO(tanzheny): comment on the magic number. 
+ const int64 kCostPerUnit = 500 * num_features_; + const DeviceBase::CpuWorkerThreads& worker_threads = + *context->device()->tensorflow_cpu_worker_threads(); + Shard(worker_threads.num_threads, worker_threads.workers, num_features_, + kCostPerUnit, do_quantile_summary_gen); + } + + private: + int64 num_features_; +}; + +REGISTER_KERNEL_BUILDER( + Name("BoostedTreesFlushQuantileSummaries").Device(DEVICE_CPU), + BoostedTreesFlushQuantileSummariesOp); + class BoostedTreesQuantileStreamResourceAddSummariesOp : public OpKernel { public: explicit BoostedTreesQuantileStreamResourceAddSummariesOp( diff --git a/tensorflow/core/ops/boosted_trees_ops.cc b/tensorflow/core/ops/boosted_trees_ops.cc index b05b2f57898..4e33bcdd644 100644 --- a/tensorflow/core/ops/boosted_trees_ops.cc +++ b/tensorflow/core/ops/boosted_trees_ops.cc @@ -594,6 +594,20 @@ REGISTER_OP("BoostedTreesMakeQuantileSummaries") return Status::OK(); }); +REGISTER_OP("BoostedTreesFlushQuantileSummaries") + .Attr("num_features: int >= 0") + .Input("quantile_stream_resource_handle: resource") + .Output("summaries: num_features * float") + .SetShapeFn([](InferenceContext* c) { + int num_features; + TF_RETURN_IF_ERROR(c->GetAttr("num_features", &num_features)); + for (int i = 0; i < num_features; ++i) { + // the columns are value, weight, min_rank, max_rank. + c->set_output(i, c->MakeShape({c->UnknownDim(), 4})); + } + return Status::OK(); + }); + REGISTER_OP("BoostedTreesQuantileStreamResourceAddSummaries") .Attr("num_features: int >= 0") .Input("quantile_stream_resource_handle: resource") diff --git a/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py index 0315456447d..bbceb826dea 100644 --- a/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py +++ b/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py @@ -29,6 +29,7 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.ops import boosted_trees_ops from tensorflow.python.ops import resources +from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_flush_quantile_summaries as flush_quantile_summaries from tensorflow.python.ops.gen_boosted_trees_ops import boosted_trees_quantile_stream_resource_handle_op as resource_handle_op from tensorflow.python.ops.gen_boosted_trees_ops import is_boosted_trees_quantile_stream_resource_initialized as resource_initialized from tensorflow.python.platform import googletest @@ -107,6 +108,41 @@ class QuantileOpsTest(test_util.TensorFlowTestCase): self.assertAllClose(self._feature_0_quantiles, quantiles[0].eval()) self.assertAllClose(self._feature_1_quantiles, quantiles[1].eval()) + def testBasicQuantileBucketsMultipleResourcesAddFlushed(self): + with self.cached_session(): + quantile_accumulator_handle_0 = self.create_resource("floats_0", self.eps, + self.max_elements, 2) + quantile_accumulator_handle_1 = self.create_resource("floats_1", self.eps, + self.max_elements, 2) + resources.initialize_resources(resources.shared_resources()).run() + summaries = boosted_trees_ops.make_quantile_summaries( + [self._feature_0, self._feature_1], self._example_weights, + epsilon=self.eps) + summary_op = boosted_trees_ops.quantile_add_summaries( + quantile_accumulator_handle_0, summaries) + flushed_summaries = flush_quantile_summaries( + quantile_accumulator_handle_0, num_features=2) + + # We are testing whether the flushed summaries output at the previous step + # will give the same 
expected results by inputting it to add_summaries + summary_op_2 = boosted_trees_ops.quantile_add_summaries( + quantile_accumulator_handle_1, flushed_summaries) + flush_op = boosted_trees_ops.quantile_flush( + quantile_accumulator_handle_1, self.num_quantiles) + buckets = boosted_trees_ops.get_bucket_boundaries( + quantile_accumulator_handle_1, num_features=2) + quantiles = boosted_trees_ops.boosted_trees_bucketize( + [self._feature_0, self._feature_1], buckets) + self.evaluate(summary_op) + self.evaluate(summary_op_2) + self.evaluate(flush_op) + + self.assertAllClose(self._feature_0_boundaries, buckets[0].eval()) + self.assertAllClose(self._feature_1_boundaries, buckets[1].eval()) + + self.assertAllClose(self._feature_0_quantiles, quantiles[0].eval()) + self.assertAllClose(self._feature_1_quantiles, quantiles[1].eval()) + def testBasicQuantileBucketsMultipleResources(self): with self.cached_session() as sess: quantile_accumulator_handle_0 = self.create_resource("float_0", self.eps, diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt index abf0eae4522..473323b088c 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt @@ -500,6 +500,10 @@ tf_module { name: "BoostedTreesExampleDebugOutputs" argspec: "args=[\'tree_ensemble_handle\', \'bucketized_features\', \'logits_dimension\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "BoostedTreesFlushQuantileSummaries" + argspec: "args=[\'quantile_stream_resource_handle\', \'num_features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } member_method { name: "BoostedTreesGetEnsembleStates" argspec: "args=[\'tree_ensemble_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt index abf0eae4522..473323b088c 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt @@ -500,6 +500,10 @@ tf_module { name: "BoostedTreesExampleDebugOutputs" argspec: "args=[\'tree_ensemble_handle\', \'bucketized_features\', \'logits_dimension\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "BoostedTreesFlushQuantileSummaries" + argspec: "args=[\'quantile_stream_resource_handle\', \'num_features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } member_method { name: "BoostedTreesGetEnsembleStates" argspec: "args=[\'tree_ensemble_handle\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " From c37b1fe9f4e18c916da9a80d220b8b657544ca92 Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Tue, 23 Jul 2019 15:03:14 -0700 Subject: [PATCH 0410/3053] Renamed prefixes and sgraph --> segment_graph. 
--- .../tf2tensorrt/convert/convert_graph.cc | 42 +++++++++---------- .../tf2tensorrt/convert/convert_graph.h | 4 +- .../tf2tensorrt/convert/convert_nodes.cc | 16 +++---- .../tf2tensorrt/convert/convert_nodes.h | 2 - .../tf2tensorrt/kernels/trt_engine_op.cc | 10 ++--- .../tf2tensorrt/kernels/trt_engine_op_test.cc | 6 +-- .../tf2tensorrt/utils/funcdef_to_graphdef.cc | 17 ++++---- 7 files changed, 46 insertions(+), 51 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index f83513c07b2..15096961632 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -537,24 +537,24 @@ Status CreateTRTNode(const ConversionParams& params, // Function to construct a funcdef from the segment and add it to the graph. Status ModifyGraphForFunctionDef(Graph* graph, const GraphDef& segment, - Graph* sgraph) { - // sgraph is a graph for the segment, to be modified by this function + Graph* segment_graph) { + // segment_graph is a graph for the segment, to be modified by this function // graph is the input graph to be optimized by TRT. GraphConstructorOptions gcopts; - TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(gcopts, segment, sgraph)); + TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(gcopts, segment, segment_graph)); std::map io_nodes; int num_inputs = 0; - for (auto n : sgraph->op_nodes()) { - if (absl::StartsWith(n->name(), prefixes.kInputPHName)) { + for (auto n : segment_graph->op_nodes()) { + if (absl::StartsWith(n->name(), IONamePrefixes::kInputPHName)) { num_inputs++; io_nodes.insert({n->name(), n}); - } else if (absl::StartsWith(n->name(), prefixes.kOutputPHName)) { + } else if (absl::StartsWith(n->name(), IONamePrefixes::kOutputPHName)) { io_nodes.insert({n->name(), n}); } } for (int i = 0; i < num_inputs; ++i) { - auto name = StrCat(prefixes.kInputPHName, i); + auto name = StrCat(IONamePrefixes::kInputPHName, i); auto node = io_nodes[name]; NodeDef nd; NodeDefBuilder node_builder(StrCat(name, "_Arg"), @@ -564,12 +564,12 @@ Status ModifyGraphForFunctionDef(Graph* graph, const GraphDef& segment, .Attr("index", i) .Finalize(&nd)); Status s; - auto node_arg = sgraph->AddNode(nd, &s); + auto node_arg = segment_graph->AddNode(nd, &s); if (!s.ok()) { LOG(ERROR) << "Couldn't add _Arg node for " << name; } for (auto edge : node->out_edges()) { - sgraph->AddEdge(node_arg, 0, edge->dst(), edge->dst_input()); + segment_graph->AddEdge(node_arg, 0, edge->dst(), edge->dst_input()); VLOG(1) << "Updating funcdef input " << node_arg->name() << ":" << 0 << " - > " << edge->dst()->name() << ":" << edge->dst_input(); if (!s.ok()) { @@ -577,11 +577,11 @@ Status ModifyGraphForFunctionDef(Graph* graph, const GraphDef& segment, << " to " << edge->dst()->name() << ":" << edge->dst_input(); } } - sgraph->RemoveNode(node); + segment_graph->RemoveNode(node); } for (int i = 0; i < io_nodes.size() - num_inputs; ++i) { - auto name = StrCat(prefixes.kOutputPHName, i); + auto name = StrCat(IONamePrefixes::kOutputPHName, i); auto node = io_nodes[name]; NodeDef nd; NodeDefBuilder node_builder(StrCat(name, "_Ret"), @@ -601,30 +601,30 @@ Status ModifyGraphForFunctionDef(Graph* graph, const GraphDef& segment, VLOG(3) << nd.DebugString(); } Status s; - auto node_ret = sgraph->AddNode(nd, &s); + auto node_ret = segment_graph->AddNode(nd, &s); if (!s.ok()) { LOG(ERROR) << "Couldn't add _Ret node for " << name; } VLOG(1) << "Update edge from " << edge->src()->name() << ":" << 
edge->src_output() << " - > " << node_ret->name() << ":" << 0; - sgraph->AddEdge(edge->src(), edge->src_output(), node_ret, 0); - s = sgraph->UpdateEdge(edge->src(), edge->src_output(), node_ret, 0); + segment_graph->AddEdge(edge->src(), edge->src_output(), node_ret, 0); + s = segment_graph->UpdateEdge(edge->src(), edge->src_output(), node_ret, 0); if (!s.ok()) { LOG(ERROR) << "Failed to update edge from " << edge->src()->name() << ":" << edge->src_output() << " - > " << node_ret->name() << ":" << 0; } - sgraph->RemoveNode(node); + segment_graph->RemoveNode(node); } return Status::OK(); } -Status RegisterModifiedGraphToFunctionLibrary(Graph* sgraph, Graph* graph, +Status RegisterModifiedGraphToFunctionLibrary(Graph* segment_graph, Graph* graph, FunctionDefLibrary fdeflib, const string& engine_name) { auto native_segment = fdeflib.add_function(); TF_RETURN_IF_ERROR(GraphToFunctionDef( - *sgraph, StrCat(engine_name, "_native_segment"), native_segment)); + *segment_graph, StrCat(engine_name, "_native_segment"), native_segment)); // Set kIntsonDeviceAttr to true so that all TRTEngineOp outputs are always on // a GPU device as expected. Otherwise, some of the tensors of type DT_INT32 // would be on host if the op generating the tensor has host memory tag set. @@ -696,7 +696,7 @@ Status ConvertAfterShapes(const ConversionParams& params) { if (params.precision_mode != TrtPrecisionMode::INT8 && params.use_calibration) { return errors::InvalidArgument( - "Calibration requires enabling fallback to TF function execution."); + "Calibration with FP32 or FP16 is not supported."); } // Convert graphdef to graph. @@ -758,16 +758,16 @@ Status ConvertAfterShapes(const ConversionParams& params) { curr_engine.use_calibration = params.use_calibration; curr_engine.maximum_cached_engines = params.max_cached_engines; - Graph sgraph(flib); + Graph segment_graph(flib); status = ModifyGraphForFunctionDef(&graph, curr_engine.segment_graph_def, - &sgraph); + &segment_graph); if (!status.ok()) { LOG(WARNING) << "Failed to modify graph as a function " << t << ": " << status; continue; } FunctionDefLibrary fdeflib; - status = RegisterModifiedGraphToFunctionLibrary(&sgraph, &graph, fdeflib, + status = RegisterModifiedGraphToFunctionLibrary(&segment_graph, &graph, fdeflib, curr_engine.engine_name); if (!status.ok()) { diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h index b40bc2ecf9b..62af1af338f 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h @@ -57,9 +57,9 @@ std::pair GetDeviceAndAllocator(const ConversionParams& params, const EngineInfo& engine); Status ModifyGraphForFunctionDef(Graph* graph, const GraphDef& segment, - Graph* sgraph); + Graph* segment_graph); -Status RegisterModifiedGraphToFunctionLibrary(Graph* sgraph, Graph* graph, +Status RegisterModifiedGraphToFunctionLibrary(Graph* segment_graph, Graph* graph, FunctionDefLibrary fdeflib, const string& engine_name); } // namespace convert diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index 3920dad6b48..8419c13a37b 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -80,10 +80,10 @@ namespace tensorrt { namespace convert { bool IsEngineInput(absl::string_view name) { - return absl::StartsWith(name, prefixes.kInputPHName); + return 
absl::StartsWith(name, IONamePrefixes::kInputPHName); } bool IsEngineOutput(absl::string_view name) { - return absl::StartsWith(name, prefixes.kOutputPHName); + return absl::StartsWith(name, IONamePrefixes::kOutputPHName); } using absl::StrAppend; @@ -5019,7 +5019,7 @@ Status ConvertGraphDefToEngine( string type_key; if (node_def.op() == "Placeholder") { if (!strings::safe_strto32( // non-absl ok - node_name.c_str() + strlen(prefixes.kInputPHName), &slot_number)) { + node_name.c_str() + strlen(IONamePrefixes::kInputPHName), &slot_number)) { return errors::InvalidArgument("Failed to parse slot number from ", node_name); } @@ -5061,7 +5061,7 @@ Status ConvertGraphDefToEngine( int32 slot_number = -1; if (node_def.op() == "Identity") { if (!strings::safe_strto32( // non-absl ok - node_name.c_str() + strlen(prefixes.kOutputPHName), &slot_number)) { + node_name.c_str() + strlen(IONamePrefixes::kOutputPHName), &slot_number)) { return errors::InvalidArgument("Failed to parse slot number from ", node_name); } @@ -5069,7 +5069,7 @@ Status ConvertGraphDefToEngine( slot_number = node_def.attr().at("index").i(); } else { return errors::InvalidArgument("Node with name ", node_name, - " starting with prefixes.kOutputPHName is " + " starting with IONamePrefixes::kOutputPHName is " "neither Identity nor Retval, instead ", node_def.op()); } @@ -5140,7 +5140,7 @@ Status ConvertSegmentToGraphDef( // Add dummy input/output nodes to the segment graphdef. if (connection.is_input_edge) { - const string node_name = StrCat(prefixes.kInputPHName, connection.port_number); + const string node_name = StrCat(IONamePrefixes::kInputPHName, connection.port_number); if (marker_nodes.count(node_name)) { VLOG(1) << "Reusing input " << node_name << " for the edge " << connection.outside_node_name << ":" @@ -5159,7 +5159,7 @@ Status ConvertSegmentToGraphDef( << " -> " << connection.inside_node_name << ":" << connection.inside_port; } else { - const string node_name = StrCat(prefixes.kOutputPHName, connection.port_number); + const string node_name = StrCat(IONamePrefixes::kOutputPHName, connection.port_number); if (marker_nodes.count(node_name)) { VLOG(1) << "Reusing output " << node_name << " for the edge " << connection.inside_node_name << ":" << connection.inside_port @@ -5198,7 +5198,7 @@ Status ConvertSegmentToGraphDef( auto snode = segment_def->mutable_node(old_to_new_id_map[connection.inside_id]); const string placeholder_name = - StrCat(prefixes.kInputPHName, connection.port_number); + StrCat(IONamePrefixes::kInputPHName, connection.port_number); VLOG(1) << "Updating " << snode->name() << ":" << connection.inside_port << " from " << snode->input(connection.inside_port) << " to " << placeholder_name; diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h index 9dfe8ed3b1d..bac845ce2c4 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h @@ -49,8 +49,6 @@ namespace convert { (NV_TENSORRT_MAJOR == major && NV_TENSORRT_MINOR == minor && \ NV_TENSORRT_PATCH == patch && NV_TENSORRT_BUILD >= build)) -extern const IONamePrefixes prefixes = IONamePrefixes(); - struct EngineConnection { // Constructs a non-control edge. 
EngineConnection(const string& outside, int out_id, int out_port, diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index 6fccdaa4fe9..ca23f84aead 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -56,8 +56,6 @@ using ::stream_executor::port::StatusOr; // A helper class to call done() when destructed for asynchronous execution. // Helps simultaneous execution of native and TRT engines. -auto prefixes = IONamePrefixes(); - class AsyncHelper : public core::RefCounted { public: AsyncHelper(AsyncOpKernel::DoneCallback done) : done_(done) {} @@ -326,7 +324,7 @@ void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx, calib_res->device_tensors_.at(i).AccessTensor(ctx); CHECK_EQ(t.TotalBytes(), device_tensor->TotalBytes()); input_data.emplace( - StrCat(prefixes.kInputPHName, static_engine_ ? i : input_node_ids_[i]), + StrCat(IONamePrefixes::kInputPHName, static_engine_ ? i : input_node_ids_[i]), data_address); } VLOG(2) << "Filled map for sending"; @@ -469,7 +467,7 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx, for (int i = 0; i < ctx->num_inputs(); i++) { const string input_name = - StrCat(prefixes.kInputPHName, static_engine_ ? i : input_node_ids_[i]); + StrCat(IONamePrefixes::kInputPHName, static_engine_ ? i : input_node_ids_[i]); const int binding_index = cuda_engine->getBindingIndex(input_name.c_str()); if (binding_index == -1) { const string msg = @@ -511,7 +509,7 @@ bool TRTEngineOp::ExecuteTrtEngine(OpKernelContext* ctx, for (int i = 0; i < ctx->num_outputs(); i++) { // Create an output tensor - const string output_name = StrCat(prefixes.kOutputPHName, + const string output_name = StrCat(IONamePrefixes::kOutputPHName, static_engine_ ? i : output_node_ids_[i]); const int binding_index = cuda_engine->getBindingIndex(output_name.c_str()); Tensor* output_tensor = nullptr; @@ -741,7 +739,7 @@ Status TRTEngineOp::AllocateCalibrationResources( "Unsupported data type encountered in input ", i); } cres->device_buffers_.emplace( - StrCat(prefixes.kInputPHName, static_engine_ ? i : input_node_ids_[i]), + StrCat(IONamePrefixes::kInputPHName, static_engine_ ? 
i : input_node_ids_[i]), std::pair(device_address, device_tensor->TotalBytes())); } cres->calibrator_.reset( diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc index 4eef454f8f3..08330b58bd7 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc @@ -63,10 +63,10 @@ class TRTEngineOpTestBase : public OpsTestBase { TF_ASSERT_OK(s.ToGraphDef(&graph_def)); const string func_name = "myop_native_segment"; Graph* graph = s.graph(); - Graph sgraph(graph->flib_def()); + Graph segment_graph(graph->flib_def()); TF_ASSERT_OK(convert::ModifyGraphForFunctionDef( - graph, graph_def, &sgraph)); - TF_ASSERT_OK(convert::RegisterModifiedGraphToFunctionLibrary(&sgraph, graph, + graph, graph_def, &segment_graph)); + TF_ASSERT_OK(convert::RegisterModifiedGraphToFunctionLibrary(&segment_graph, graph, flib_def_->ToProto(), "myop")); PartialTensorShape shape({-1, -1}); diff --git a/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc b/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc index 13457ba5fd2..d17f6efc1fc 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc @@ -14,8 +14,6 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.h" -#include "absl/strings/ascii.h" -#include "absl/strings/str_cat.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/graph_optimizer.h" #include "tensorflow/core/framework/node_def_builder.h" @@ -23,23 +21,24 @@ limitations under the License. #include "tensorflow/core/graph/graph_constructor.h" #include "tensorflow/core/platform/logging.h" +#include "absl/strings/ascii.h" +#include "absl/strings/str_cat.h" + namespace tensorflow { namespace tensorrt { -auto prefixes = IONamePrefixes(); - string AppendIdToNodeName(const Node* n) { - if (absl::StartsWith(n->name(), prefixes.kInputPHNameLower)) { - return strings::StrCat(prefixes.kInputPHName, n->id()); - } else if (absl::StartsWith(n->name(), prefixes.kOutputPHNameLower)) { - return strings::StrCat(prefixes.kOutputPHName, n->id()); + if (absl::StartsWith(n->name(), IONamePrefixes::kInputPHNameLower)) { + return strings::StrCat(IONamePrefixes::kInputPHName, n->id()); + } else if (absl::StartsWith(n->name(), IONamePrefixes::kOutputPHNameLower)) { + return strings::StrCat(IONamePrefixes::kOutputPHName, n->id()); } return strings::StrCat("n", n->id()); } void ToGraphDefWithIOPrefix(const Graph* g, GraphDef* gdef) { // This is the same function as in function.cc. However, it uses the - // name mapping above, which retains IO prefixes (prefixes.kInputPHName etc) + // name mapping above, which retains IO prefixes (IONamePrefixes::kInputPHName etc) gtl::InlinedVector inputs; gdef->Clear(); *gdef->mutable_versions() = g->versions(); From eea51e6235bcc6d22528a351cd768643a88e2654 Mon Sep 17 00:00:00 2001 From: Sundeep Gottipati Date: Tue, 23 Jul 2019 14:50:50 -0700 Subject: [PATCH 0411/3053] Improve the sorting test by comparing pairs of different object types with "<". 
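
A minimal sketch of the ordering convention the test below relies on (the `Sortable` class here is a hypothetical stand-in, not the real FeatureColumn): defining both `__lt__` and `__gt__` in terms of string representations lets a mixed list of strings and objects sort without a TypeError, because Python falls back to the reflected comparison when `str.__lt__` returns NotImplemented.

  class Sortable(object):
    """Hypothetical stand-in: compares to anything via its string repr."""

    def __lt__(self, other):
      return str(self) < str(other)

    def __gt__(self, other):
      return str(self) > str(other)

  a, b = Sortable(), Sortable()
  # '<__main__.Sortable object at ...>' sorts after '0' and before 'd',
  # so strings and objects can be mixed in one sorted sequence.
  print(sorted(['d', b, a, '0']))  # ['0', <Sortable ...>, <Sortable ...>, 'd']
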
PiperOrigin-RevId: 259616602 --- .../feature_column/feature_column_v2_test.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py index 3391badb4e9..5b4c26308f6 100644 --- a/tensorflow/python/feature_column/feature_column_v2_test.py +++ b/tensorflow/python/feature_column/feature_column_v2_test.py @@ -91,15 +91,26 @@ class BaseFeatureColumnForTests(fc.FeatureColumn): class SortableFeatureColumnTest(test.TestCase): - def test_sort_columns_by_name(self): + def test_sort_columns_by_string_representation(self): # These should be sorted lexicographically based on their string # representations. For FeatureColumns, this looks like # '<__main__.FeatureColumn object at ...>'. - a = fc.numeric_column('first') # '<__main__.NumericColumn object at 0xa>' - b = fc.numeric_column('second') # '<__main__.NumericColumn object at 0xb>' + a = fc.numeric_column('first') # '<__main__.NumericColumn ...>' + b = fc.numeric_column('second') # '<__main__.NumericColumn ...>' c = fc_old._numeric_column('third') # '<__main__._NumericColumn ...>' - self.assertAllEqual(sorted(['d', c, b, a, '0']), ['0', a, b, c, 'd']) + + sorted_sequence = ['0', a, b, c, 'd'] + reversed_sequence = sorted_sequence[::-1] + self.assertAllEqual(sorted(reversed_sequence), sorted_sequence) + + # pylint: disable=g-generic-assert + self.assertTrue(a < b) # V2 < V2 feature columns. + self.assertTrue(a < c) # V2 < V1 feature columns. + self.assertFalse(c < a) # V1 < V2 feature columns. + self.assertTrue('0' < a) # string < V2 feature column. + self.assertTrue(a < 'd') # V2 feature column < string. + # pylint: enable=g-generic-assert class LazyColumnTest(test.TestCase): From 07a6725462ac030eddfd7fb9bed8c299482d0f57 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Tue, 23 Jul 2019 15:01:15 -0700 Subject: [PATCH 0412/3053] Simplify test for importing GraphDef with a custom operation This makes the test shorter and focused exactly on what it is supposed to test. 
PiperOrigin-RevId: 259618646 --- .../graph-custom-operation.pbtxt | 2169 +---------------- 1 file changed, 19 insertions(+), 2150 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt index 82146716fff..74984c35480 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt @@ -1,209 +1,8 @@ # RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s node { - name: "Placeholder" - op: "Placeholder" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "shape" - value { - shape { - unknown_rank: true - } - } - } -} -node { - name: "Placeholder_1" - op: "Placeholder" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "shape" - value { - shape { - unknown_rank: true - } - } - } -} -node { - name: "input0" - op: "TPUReplicatedInput" - input: "Placeholder" - attr { - key: "N" - value { - i: 1 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "input1" - op: "TPUReplicatedInput" - input: "Placeholder_1" - attr { - key: "N" - value { - i: 1 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "cluster/pivot" - op: "NoOp" -} -node { - name: "TPUReplicateMetadata" - op: "TPUReplicateMetadata" - input: "^cluster/pivot" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "computation_shape" - value { - list { - } - } - } - attr { - key: "device_assignment" - value { - list { - } - } - } - attr { - key: "host_compute_core" - value { - list { - } - } - } - attr { - key: "num_cores_per_replica" - value { - i: 1 - } - } - attr { - key: "num_replicas" - value { - i: 1 - } - } - attr { - key: "topology" - value { - s: "" - } - } - attr { - key: "use_tpu" - value { - b: true - } - } -} -node { - name: "replicated_input_0" - op: "Identity" - input: "input0" - input: "^TPUReplicateMetadata" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "replicated_input_1" - op: "Identity" - input: "input1" - input: "^TPUReplicateMetadata" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/maximum_iterations" + name: "Constant" op: "Const" - input: "^TPUReplicateMetadata" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 10 - } - } - } -} -node { - name: "while/iteration_counter" - op: "Const" - input: "^TPUReplicateMetadata" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } attr { key: "dtype" value { @@ -223,1968 +22,38 @@ node { } } node { - name: "while/Enter" - op: "Enter" - input: "while/iteration_counter" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "while/while_context" - } - } - attr { - key: "is_constant" - value { - b: false - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "while/Enter_1" - op: "Enter" - input: "replicated_input_0" - attr { - key: "T" - value { - type: DT_FLOAT - } 
- } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "while/while_context" - } - } - attr { - key: "is_constant" - value { - b: false - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "while/Enter_2" - op: "Enter" - input: "replicated_input_1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "while/while_context" - } - } - attr { - key: "is_constant" - value { - b: false - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "while/Merge" - op: "Merge" - input: "while/Enter" - input: "while/NextIteration" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Merge_1" - op: "Merge" - input: "while/Enter_1" - input: "while/NextIteration_1" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Merge_2" - op: "Merge" - input: "while/Enter_2" - input: "while/NextIteration_2" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Less/Enter" - op: "Enter" - input: "while/maximum_iterations" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "while/while_context" - } - } - attr { - key: "is_constant" - value { - b: true - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "while/Less" - op: "Less" - input: "while/Merge" - input: "while/Less/Enter" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/less_than_5_If8q4vKg9jA" - op: "less_than_5_If8q4vKg9jA" - input: "while/Merge_1" - input: "^while/Merge" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/LogicalAnd" - op: "LogicalAnd" - input: "while/Less" - input: "while/less_than_5_If8q4vKg9jA" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/LoopCond" - op: "LoopCond" - input: "while/LogicalAnd" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Switch" - op: "Switch" - input: "while/Merge" - input: "while/LoopCond" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_class" - value { - list { - s: "loc:@while/Merge" - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Switch_1" - op: "Switch" - input: "while/Merge_1" - input: "while/LoopCond" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@while/Merge_1" - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Switch_2" - op: "Switch" - input: "while/Merge_2" - input: "while/LoopCond" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@while/Merge_2" - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - 
name: "while/Identity" - op: "Identity" - input: "while/Switch:1" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Identity_1" - op: "Identity" - input: "while/Switch_1:1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Identity_2" - op: "Identity" - input: "while/Switch_2:1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/add/y" - op: "Const" - input: "^while/Identity" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "while/add" - op: "Add" - input: "while/Identity" - input: "while/add/y" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/add_1/y" - op: "Const" - input: "^while/Identity" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - } - float_val: 1 - } - } - } -} -node { - name: "while/add_1" - op: "Add" - input: "while/Identity_1" - input: "while/add_1/y" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/mul_2_Da30D05wlPU" - op: "mul_2_Da30D05wlPU" - input: "while/Identity_1" - input: "while/Identity_2" - input: "^while/Identity" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/NextIteration" - op: "NextIteration" - input: "while/add" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/NextIteration_1" - op: "NextIteration" - input: "while/add_1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/NextIteration_2" - op: "NextIteration" - input: "while/mul_2_Da30D05wlPU" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Exit" - op: "Exit" - input: "while/Switch" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Exit_1" - op: "Exit" - input: "while/Switch_1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Exit_2" - op: "Exit" - input: "while/Switch_2" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/Shape" - op: "Shape" - input: "while/Exit_2" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "out_type" - value { - type: DT_INT32 - } - } -} -node { - name: "gradients/grad_ys_0" - op: "Const" - input: "^TPUReplicateMetadata" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - 
attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - } - float_val: 1 - } - } - } -} -node { - name: "gradients/Fill" - op: "Fill" - input: "gradients/Shape" - input: "gradients/grad_ys_0" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "index_type" - value { - type: DT_INT32 - } - } -} -node { - name: "gradients/f_count" - op: "Const" - input: "^TPUReplicateMetadata" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 0 - } - } - } -} -node { - name: "gradients/f_count_1" - op: "Enter" - input: "gradients/f_count" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "while/while_context" - } - } - attr { - key: "is_constant" - value { - b: false - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "gradients/Merge" - op: "Merge" - input: "gradients/f_count_1" - input: "gradients/NextIteration" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/Switch" - op: "Switch" - input: "gradients/Merge" - input: "while/LoopCond" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/Add/y" - op: "Const" - input: "^while/Identity" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "gradients/Add" - op: "Add" - input: "gradients/Switch:1" - input: "gradients/Add/y" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/f_count_2" - op: "Exit" - input: "gradients/Switch" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/b_count" - op: "Const" - input: "^TPUReplicateMetadata" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "gradients/b_count_1" - op: "Enter" - input: "gradients/f_count_2" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "gradients/while/while_context" - } - } - attr { - key: "is_constant" - value { - b: false - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "gradients/Merge_1" - op: "Merge" - input: "gradients/b_count_1" - input: "gradients/NextIteration_1" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/GreaterEqual/Enter" - op: "Enter" - input: "gradients/b_count" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - 
key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "gradients/while/while_context" - } - } - attr { - key: "is_constant" - value { - b: true - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "gradients/GreaterEqual" - op: "GreaterEqual" - input: "gradients/Merge_1" - input: "gradients/GreaterEqual/Enter" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/b_count_2" - op: "LoopCond" - input: "gradients/GreaterEqual" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/Switch_1" - op: "Switch" - input: "gradients/Merge_1" - input: "gradients/b_count_2" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/Sub" - op: "Sub" - input: "gradients/Switch_1:1" - input: "gradients/GreaterEqual/Enter" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/b_count_3" - op: "Exit" - input: "gradients/Switch_1" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/zeros_like" - op: "ZerosLike" - input: "while/Exit_1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/while/Exit_2_grad/b_exit" - op: "Enter" - input: "gradients/Fill" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "gradients/while/while_context" - } - } - attr { - key: "is_constant" - value { - b: false - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "gradients/while/Exit_1_grad/b_exit" - op: "Enter" - input: "gradients/zeros_like" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "gradients/while/while_context" - } - } - attr { - key: "is_constant" - value { - b: false - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "gradients/while/Switch_2_grad/b_switch" - op: "Merge" - input: "gradients/while/Exit_2_grad/b_exit" - input: "gradients/while/Switch_2_grad_1/NextIteration" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/while/Merge_2_grad/Switch" - op: "Switch" - input: "gradients/while/Switch_2_grad/b_switch" - input: "gradients/b_count_2" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@gradients/while/Switch_2_grad/b_switch" - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/while/Enter_2_grad/Exit" - op: "Exit" - input: "gradients/while/Merge_2_grad/Switch" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Const" - op: "Const" - input: "^cluster/pivot" - attr { - key: "_class" - value { - list { - s: 
"loc:@while/Identity_1" - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/mul" - op: "Mul" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Const" - input: "while/maximum_iterations" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_class" - value { - list { - s: "loc:@while/Identity_1" - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/f_acc" - op: "StackV2" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/mul" - attr { - key: "_class" - value { - list { - s: "loc:@while/Identity_1" - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "elem_type" - value { - type: DT_FLOAT - } - } - attr { - key: "stack_name" - value { - s: "" - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Enter" - op: "Enter" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/f_acc" - attr { - key: "T" - value { - type: DT_RESOURCE - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "while/while_context" - } - } - attr { - key: "is_constant" - value { - b: true - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPushV2" - op: "StackPushV2" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Enter" - input: "while/Identity_1" - input: "^gradients/Add" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "swap_memory" - value { - b: false - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2/Enter" - op: "Enter" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/f_acc" - attr { - key: "T" - value { - type: DT_RESOURCE - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "gradients/while/while_context" - } - } - attr { - key: "is_constant" - value { - b: true - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2" - op: "StackPopV2" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2/Enter" - input: "^gradients/Sub" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "elem_type" - value { - type: DT_FLOAT - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Const_1" - op: "Const" - input: "^cluster/pivot" - attr { - key: "_class" - value { - list { - s: "loc:@while/Identity_2" - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/mul_1" - op: "Mul" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Const_1" - input: "while/maximum_iterations" - attr { - key: "T" - 
value { - type: DT_INT32 - } - } - attr { - key: "_class" - value { - list { - s: "loc:@while/Identity_2" - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/f_acc_1" - op: "StackV2" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/mul_1" - attr { - key: "_class" - value { - list { - s: "loc:@while/Identity_2" - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "elem_type" - value { - type: DT_FLOAT - } - } - attr { - key: "stack_name" - value { - s: "" - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Enter_1" - op: "Enter" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/f_acc_1" - attr { - key: "T" - value { - type: DT_RESOURCE - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "while/while_context" - } - } - attr { - key: "is_constant" - value { - b: true - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPushV2_1" - op: "StackPushV2" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Enter_1" - input: "while/Identity_2" - input: "^gradients/Add" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "swap_memory" - value { - b: false - } - } -} -node { - name: "gradients/NextIteration" - op: "NextIteration" - input: "gradients/Add" - input: "^gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPushV2" - input: "^gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPushV2_1" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2_1/Enter" - op: "Enter" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/f_acc_1" - attr { - key: "T" - value { - type: DT_RESOURCE - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "gradients/while/while_context" - } - } - attr { - key: "is_constant" - value { - b: true - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2_1" - op: "StackPopV2" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2_1/Enter" - input: "^gradients/Sub" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "elem_type" - value { - type: DT_FLOAT - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient" - op: "SymbolicGradient" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2_1" - input: "gradients/while/Merge_2_grad/Switch:1" - input: "^gradients/Sub" - attr { - key: "Tin" - value { - list { - type: DT_FLOAT - type: DT_FLOAT - type: DT_FLOAT - } - } - } - attr { - key: "Tout" - value { - list { - type: DT_FLOAT - type: DT_FLOAT - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "f" - value { - func { - name: "mul_2_Da30D05wlPU" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - } - } - } -} -node { - name: 
"gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/b_sync" - op: "ControlTrigger" - input: "^cluster/pivot" - input: "^gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2" - input: "^gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2_1" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/NextIteration_1" - op: "NextIteration" - input: "gradients/Sub" - input: "^gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/b_sync" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/while/Switch_2_grad_1/NextIteration" - op: "NextIteration" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient:1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "NoOp" - op: "NoOp" - input: "^cluster/pivot" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "Identity" - op: "Identity" - input: "gradients/while/Enter_2_grad/Exit" - device: "/device:TPU_REPLICATED_CORE:0" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "output0" - op: "TPUReplicatedOutput" - input: "Identity" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "num_replicas" - value { - i: 1 - } - } -} -node { - name: "TPUCompilationResult" - op: "TPUCompilationResult" - input: "^TPUReplicateMetadata" - attr { - key: "_tpu_compilation_status" - value { - s: "cluster" - } - } -} -node { - name: "output_0_shard_0" - op: "Identity" - input: "output0" - input: "^NoOp" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "ConfigureDistributedTPU" - op: "ConfigureDistributedTPU" - device: "/device:TPU_SYSTEM:0" - attr { - key: "embedding_config" - value { - s: "" - } - } - attr { - key: "is_global_init" - value { - b: false - } - } - attr { - key: "tpu_embedding_config" - value { - s: "" - } - } + name: "_tf.foo" + op: "foo" + input: "Constant" } library { function { signature { - name: "mul_2_Da30D05wlPU" + name: "foo" input_arg { - name: "mul_2_da30d05wlpu" - type: DT_FLOAT - } - input_arg { - name: "mul_2_da30d05wlpu1" - type: DT_FLOAT + name: "arg" + type: DT_INT32 } output_arg { - name: "mul_2_da30d05wlpu2" - type: DT_FLOAT - } - } - node_def { - name: "mul/y" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - dim { - size: 1 - } - dim { - size: 1 - } - } - float_val: 2 - } - } - } - } - node_def { - name: "mul_0" - op: "Mul" - input: "mul_2_da30d05wlpu1" - input: "mul/y:output:0" - attr { - key: "T" - value { - type: DT_FLOAT - } + name: "return_value" + type: DT_INT32 } } ret { - key: "mul_2_da30d05wlpu2" - value: "mul_0:z:0" - } - attr { - key: "_noinline" - value { - b: true - } - } - } - function { - signature { - name: "less_than_5_If8q4vKg9jA" - input_arg { - name: "less_than_5_if8q4vkg9ja" - type: DT_FLOAT - } - output_arg { - name: "less_than_5_if8q4vkg9ja1" - type: DT_BOOL - } - } - node_def { - name: "Less/y" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - } - float_val: 5 - } - } - } - } - node_def { - name: "Less" - op: "Less" - input: "less_than_5_if8q4vkg9ja" - input: "Less/y:output:0" - 
attr { - key: "T" - value { - type: DT_FLOAT - } - } - } - ret { - key: "less_than_5_if8q4vkg9ja1" - value: "Less:z:0" - } - attr { - key: "_noinline" - value { - b: true - } + key: "return_value" + value: "arg" } } } versions { - producer: 27 + producer: 62 min_consumer: 12 } -# CHECK: func @main() { -# CHECK: %30:2 = "_tf.less_than_5_If8q4vKg9jA0"(%23#0, %29#2) {_tpu_replicate = "cluster", device = "", name = "while/less_than_5_If8q4vKg9jA"} : (tensor<*xf32>, !_tf.control) -> (tensor<*xi1>, !_tf.control) -# CHECK: %73:2 = "_tf.mul_2_Da30D05wlPU0"(%58#0, %72#0, %47#1) {_tpu_replicate = "cluster", device = "", name = "while/mul_2_Da30D05wlPU"} : (tensor<*xf32>, tensor<*xf32>, !_tf.control) -> (tensor<*xf32>, !_tf.control) -# CHECK: return -# CHECK-NEXT: } -# CHECK: func @less_than_5_If8q4vKg9jA0(%arg0: tensor<*xf32>) -> tensor<*xi1> -# CHECK-NEXT: attributes {tf._noinline = true} { -# CHECK-NEXT: %0:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_FLOAT", name = "Less/y", value = dense<5.000000e+00> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Less"(%arg0, %0#0) {T = "tfdtype$DT_FLOAT", device = "", name = "Less"} : (tensor<*xf32>, tensor) -> (tensor<*xi1>, !_tf.control) -# CHECK-NEXT: return %1#0 : tensor<*xi1> -# CHECK-NEXT: } -# CHECK: func @mul_2_Da30D05wlPU0(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> -# CHECK-NEXT: attributes {tf._noinline = true} { -# CHECK-NEXT: %0:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_FLOAT", name = "mul/y", value = dense<2.000000e+00> : tensor<1x1xf32>} : () -> (tensor<1x1xf32>, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Mul"(%arg1, %0#0) {T = "tfdtype$DT_FLOAT", device = "", name = "mul_0"} : (tensor<*xf32>, tensor<1x1xf32>) -> (tensor<*xf32>, !_tf.control) -# CHECK-NEXT: return %1#0 : tensor<*xf32> -# CHECK-NEXT: } + +# Verify that we can import a custom operation that maps to a function and that +# the names are matching between the function definition and the uses / call +# site (a numerical suffix may be appended). + +# CHECK: "tf.foo0" +# CHECK: func @foo0 From e260c0dbf87434eb86544d54b8f34f4b9f3ac6c5 Mon Sep 17 00:00:00 2001 From: Yasir Modak <42785357+ymodak@users.noreply.github.com> Date: Tue, 23 Jul 2019 15:19:45 -0700 Subject: [PATCH 0413/3053] formatted --- tensorflow/python/ops/image_ops_impl.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index 04c6c5743fb..175c5ae60a2 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -1593,9 +1593,9 @@ def adjust_brightness(image, delta): Usage Example: ```python - >> import tensorflow as tf - >> x = tf.random.normal(shape=(256, 256, 3)) - >> tf.image.adjust_brightness(x, delta=0.1) + import tensorflow as tf + x = tf.random.normal(shape=(256, 256, 3)) + tf.image.adjust_brightness(x, delta=0.1) ``` """ with ops.name_scope(None, 'adjust_brightness', [image, delta]) as name: From f663ace5614000ec1d4be354fa792a9af8e43080 Mon Sep 17 00:00:00 2001 From: Taylor Robie Date: Tue, 23 Jul 2019 15:02:22 -0700 Subject: [PATCH 0414/3053] If weights appear in multiple layers of the same model they are tracked separately which results in duplication in `.trainable_weights`. These weights must be deduplicated when training; otherwise some updates will be applied multiple times. 
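
A toy illustration of the failure mode and of the fix (NumPy arrays stand in for real variables; the names are illustrative only, while the change itself adds a `_unique_trainable_weights` helper that dedupes the actual weight list in the same spirit):

  import numpy as np

  lr, grad = 0.1, np.array([4.0])

  # The same variable collected twice: both updates hit the same storage,
  # so the effective learning rate is doubled.
  w = np.array([1.0])
  for var in [w, w]:
    var -= lr * grad
  print(w)  # [0.2]

  # Keeping only the first occurrence of each variable restores the
  # intended single update.
  w = np.array([1.0])
  seen, unique = set(), []
  for var in [w, w]:
    if id(var) not in seen:
      seen.add(id(var))
      unique.append(var)
  for var in unique:
    var -= lr * grad
  print(w)  # [0.6]
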
PiperOrigin-RevId: 259619019 --- tensorflow/python/keras/engine/base_layer.py | 11 ++++++ tensorflow/python/keras/engine/training.py | 7 ++-- .../python/keras/engine/training_eager.py | 2 +- .../python/keras/engine/training_test.py | 37 +++++++++++++++++++ tensorflow/python/keras/utils/layer_utils.py | 2 +- 5 files changed, 54 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index c26bf5b79f3..9757a71c5b0 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -2347,6 +2347,17 @@ class Layer(module.Module): serialization_cache)) return fns + @property + def _unique_trainable_weights(self): + """Dedupe trainable weights while maintaining order as much as possible.""" + trainable_weights = self.trainable_weights + output, seen_weights = [], set() + for w in trainable_weights: + if w not in seen_weights: + output.append(w) + seen_weights.add(w) + return output + class TensorFlowOpLayer(Layer): """Wraps a TensorFlow Operation in a Layer. diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index 4d8051cdfae..eb10f20fb0d 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -366,7 +366,7 @@ class Model(network.Network): self.predict_function = None # Collected trainable weights, sorted in topological order. - self._collected_trainable_weights = self.trainable_weights + self._collected_trainable_weights = self._unique_trainable_weights # Validate all variables were correctly created in distribution scope. if self._distribution_strategy and not self._compile_distribution: @@ -1477,7 +1477,7 @@ class Model(network.Network): # Set metric attributes on model. self._set_metric_attributes() - self._collected_trainable_weights = self.trainable_weights + self._collected_trainable_weights = self._unique_trainable_weights def _update_sample_weight_modes(self, sample_weights=None): """Updates sample weight modes based on training/eval inputs. 
@@ -1985,7 +1985,8 @@ class Model(network.Network): if not hasattr(self, '_collected_trainable_weights'): return - if len(self.trainable_weights) != len(self._collected_trainable_weights): + if (len(self._unique_trainable_weights) != + len(self._collected_trainable_weights)): logging.log_first_n( logging.WARN, 'Discrepancy between trainable weights and collected' ' trainable weights, did you set `model.trainable`' diff --git a/tensorflow/python/keras/engine/training_eager.py b/tensorflow/python/keras/engine/training_eager.py index 2c182391273..a1470fe4fa8 100644 --- a/tensorflow/python/keras/engine/training_eager.py +++ b/tensorflow/python/keras/engine/training_eager.py @@ -243,7 +243,7 @@ def _process_single_batch(model, else: scaled_total_loss = total_loss if training: - trainable_weights = model.trainable_weights + trainable_weights = model._unique_trainable_weights if trainable_weights: grads = tape.gradient(scaled_total_loss, trainable_weights) if isinstance(model.optimizer, loss_scale_optimizer.LossScaleOptimizer): diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py index 151a3532945..8672abe10d4 100644 --- a/tensorflow/python/keras/engine/training_test.py +++ b/tensorflow/python/keras/engine/training_test.py @@ -41,6 +41,7 @@ from tensorflow.python.keras import metrics as metrics_module from tensorflow.python.keras import testing_utils from tensorflow.python.keras.engine import training_utils from tensorflow.python.keras.callbacks import Callback +from tensorflow.python.keras.optimizer_v2 import gradient_descent from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import sparse_ops @@ -894,6 +895,42 @@ class TrainingTest(keras_parameterized.TestCase): x2 = model.predict(val_a) self.assertAllClose(x1, x2, atol=1e-7) + @keras_parameterized.run_all_keras_modes + def test_weight_deduplication(self): + class WatchingLayer(keras.layers.Layer): + + def __init__(self, dense_to_track): + # This will cause the kernel and bias to be double counted, effectively + # doubling the learning rate if weights are not deduped. + self._kernel = dense_to_track.kernel + self._bias = dense_to_track.bias + super(WatchingLayer, self).__init__() + + inp = keras.layers.Input(shape=(1,)) + dense_layer = keras.layers.Dense(1) + dense_output = dense_layer(inp) # This will build the dense kernel + + # Deterministically set weights to make the test repeatable. + dense_layer.set_weights([np.ones((1, 1)), np.zeros((1,))]) + output = WatchingLayer(dense_layer)(dense_output) + + model = keras.models.Model(inp, output) + + # 0.25 is the edge of the radius of convergence for the double apply case. + # At lr=0.24, the double apply case will very slowly descend while the + # correct case will drop very quickly. + model.compile(loss='mse', optimizer=gradient_descent.SGD(0.24), + run_eagerly=testing_utils.should_run_eagerly()) + + x = np.ones((64 * 2,)) + y = 4.5 * x - 3. + + history = model.fit(x, y, batch_size=64, epochs=2, verbose=2) + + # If the gradient apply is duplicated then the loss after 2 epochs will + # be ~0.15, compared to the correct answer of O(1e-7). 
+ self.assertLess(history.history['loss'][-1], 1e-6) + def test_logs_passed_to_callbacks(self): with self.cached_session(): input_dim = 5 diff --git a/tensorflow/python/keras/utils/layer_utils.py b/tensorflow/python/keras/utils/layer_utils.py index 9000022bd14..4bd65eafba1 100644 --- a/tensorflow/python/keras/utils/layer_utils.py +++ b/tensorflow/python/keras/utils/layer_utils.py @@ -231,7 +231,7 @@ def print_summary(model, line_length=None, positions=None, print_fn=None): if hasattr(model, '_collected_trainable_weights'): trainable_count = count_params(model._collected_trainable_weights) else: - trainable_count = count_params(model.trainable_weights) + trainable_count = count_params(model._unique_trainable_weights) non_trainable_count = count_params(model.non_trainable_weights) From 1612f951697f1f7dab91e8d352740fe3128fa0cb Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Tue, 23 Jul 2019 15:23:54 -0700 Subject: [PATCH 0415/3053] [INTEL MKL] Fix Conv3D output tensor shape when the tensor is empty. Also fix some existing Clang issues in mkl_conv_ops.cc. --- tensorflow/core/kernels/mkl_conv_ops.cc | 16 +++++------ .../python/kernel_tests/conv_ops_3d_test.py | 28 ++++++++++++++++++- 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc index 14344da0560..e4f8f338205 100644 --- a/tensorflow/core/kernels/mkl_conv_ops.cc +++ b/tensorflow/core/kernels/mkl_conv_ops.cc @@ -24,8 +24,8 @@ limitations under the License. #include #include -#include "mkldnn.hpp" #include "absl/strings/str_join.h" +#include "mkldnn.hpp" #include "tensorflow/core/framework/bounds_check.h" #include "tensorflow/core/framework/numeric_op.h" #include "tensorflow/core/framework/op_kernel.h" @@ -450,17 +450,15 @@ class MklConvOp : public OpKernel { OP_REQUIRES(context, dilations_.size() == 5, errors::InvalidArgument("Dilation rates field must " "specify 5 dimensions")); - OP_REQUIRES(context, - (GetTensorDim(dilations_, data_format_, 'N') == 1 && - GetTensorDim(dilations_, data_format_, 'C') == 1), + OP_REQUIRES(context, (GetTensorDim(dilations_, data_format_, 'N') == 1 && + GetTensorDim(dilations_, data_format_, 'C') == 1), errors::InvalidArgument( "Current implementation does not yet support " "dilations rates in the batch and depth dimensions.")); OP_REQUIRES( - context, - (GetTensorDim(dilations_, data_format_, '0') > 0 && - GetTensorDim(dilations_, data_format_, '1') > 0 && - GetTensorDim(dilations_, data_format_, '2') > 0), + context, (GetTensorDim(dilations_, data_format_, '0') > 0 && + GetTensorDim(dilations_, data_format_, '1') > 0 && + GetTensorDim(dilations_, data_format_, '2') > 0), errors::InvalidArgument("Dilated rates should be larger than 0.")); } } @@ -525,7 +523,7 @@ class MklConvOp : public OpKernel { MklDnnShape dst_mkl_shape; dst_mkl_shape.SetMklTensor(false); AllocateOutputSetMklShape(context, kOutputIndex_Dst, &dst_tensor, - src_tf_shape, dst_mkl_shape); + dst_tf_shape, dst_mkl_shape); // MklConv2D/3D also outputs converted filter as 2nd output. 
filter_mkl_shape.SetMklTensor(false); diff --git a/tensorflow/python/kernel_tests/conv_ops_3d_test.py b/tensorflow/python/kernel_tests/conv_ops_3d_test.py index 608ee57ed69..60a8ad466b1 100644 --- a/tensorflow/python/kernel_tests/conv_ops_3d_test.py +++ b/tensorflow/python/kernel_tests/conv_ops_3d_test.py @@ -32,7 +32,6 @@ from tensorflow.python.ops import gradients_impl from tensorflow.python.ops import nn_ops import tensorflow.python.ops.nn_grad # pylint: disable=unused-import from tensorflow.python.platform import test -from tensorflow.python.framework import test_util def GetTestConfigs(): @@ -317,6 +316,33 @@ class Conv3DTest(test.TestCase): padding="SAME", expected=expected_output) + def _TestConv3DEmptyTensorOutputShape(self): + """Verifies the output shape of the Conv3D op when output tensor is empty. + + Args: none + """ + input_shape = [0, 112, 112, 112, 32] + filter_shape = [3, 3, 3, 32, 64] + + output_shape = [0, 112, 112, 112, 64] + input_data = 1 + filter_data = 1 + for data_type in self._DtypesToTest(False): + input_tensor = constant_op.constant( + input_data, shape=input_shape, dtype=data_type, name="input") + filter_tensor = constant_op.constant( + filter_data, shape=filter_shape, dtype=data_type, name="filter") + conv = nn_ops.conv3d( + input_tensor, + filter_tensor, + [1, 1, 1, 1, 1], + dilations=[1, 1, 1, 1, 1], + padding='SAME', + data_format='NDHWC', + name="conv") + values = self.evaluate(conv) + self.assertEqual(values.shape, tensor_shape.TensorShape(output_shape)) + def testKernelSmallerThanStride(self): expected_output = [ 0.03703704, 0.11111111, 0.25925926, 0.33333333, 0.7037037, 0.77777778, From 2ed843260a8e029574c5dc8bf07bfd5da799d0f1 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 23 Jul 2019 15:02:45 -0700 Subject: [PATCH 0416/3053] get rid of two parameters constructor of ScopeAnnotation. PiperOrigin-RevId: 259619133 --- .../common_runtime/eager/kernel_and_device.cc | 10 +++---- tensorflow/core/platform/annotation.h | 6 ---- .../internal/scoped_annotation_test.cc | 29 ------------------- 3 files changed, 5 insertions(+), 40 deletions(-) diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.cc b/tensorflow/core/common_runtime/eager/kernel_and_device.cc index eb7b1b7eb23..3492ddf7781 100644 --- a/tensorflow/core/common_runtime/eager/kernel_and_device.cc +++ b/tensorflow/core/common_runtime/eager/kernel_and_device.cc @@ -319,14 +319,14 @@ Status KernelAndDeviceOp::Run(ScopedStepContainer* step_container, // 'ScopedActivity' will trace the OpKernel scheduling time on host. profiler::TraceMe activity( [&] { - return strings::StrCat( - op_name, ":", kernel_->type_string(), - "#id=", step_container ? step_container->step_id() : 0, - ",device=", device_->name(), ",async=false#"); + return absl::StrCat(op_name, ":", kernel_->type_string(), "#id=", + step_container ? step_container->step_id() : 0, + ",device=", device_->name(), ",async=false#"); }, profiler::TraceMeLevel::kInfo); // 'ScopedAnnotation' will trace the OpKernel execution time on device. 
- tracing::ScopedAnnotation annotation(op_name, kernel_->type_string()); + tracing::ScopedAnnotation annotation( + [&]() { return absl::StrCat(op_name, ":", kernel_->type_string()); }); device_->Compute(kernel_.get(), &context); } else { profiler::TraceMe activity( diff --git a/tensorflow/core/platform/annotation.h b/tensorflow/core/platform/annotation.h index 660767eec25..3648a7e9ee2 100644 --- a/tensorflow/core/platform/annotation.h +++ b/tensorflow/core/platform/annotation.h @@ -114,12 +114,6 @@ class ScopedAnnotation { } } - // Deprecated: use the lambda version if you want to concatenate strings as - // annotation on the fly. - ScopedAnnotation(absl::string_view name_part1, absl::string_view name_part2) - : ScopedAnnotation( - [&]() { return StrCat(name_part1, ":", name_part2); }) {} - // Pops the name passed in the constructor from the current annotation. ~ScopedAnnotation() { // TODO(b/137971921): without this memory fence, two presubmit tests will diff --git a/tensorflow/core/profiler/internal/scoped_annotation_test.cc b/tensorflow/core/profiler/internal/scoped_annotation_test.cc index 53164f72fdb..56a5e974107 100644 --- a/tensorflow/core/profiler/internal/scoped_annotation_test.cc +++ b/tensorflow/core/profiler/internal/scoped_annotation_test.cc @@ -75,20 +75,6 @@ void BM_ScopedAnnotationEnabled(int iters, int annotation_size) { BENCHMARK(BM_ScopedAnnotationEnabled)->Arg(8)->Arg(32)->Arg(128); -void BM_ScopedAnnotationEnabled_TwoParts(int iters, int annotation_size) { - testing::StopTiming(); - std::string annotation = GenerateRandomString(annotation_size); - tracing::ScopedAnnotation::Enable(true); - testing::StartTiming(); - for (int i = 0; i < iters; i++) { - tracing::ScopedAnnotation trace(annotation, annotation); - } - testing::StopTiming(); - tracing::ScopedAnnotation::Enable(false); -} - -BENCHMARK(BM_ScopedAnnotationEnabled_TwoParts)->Arg(8)->Arg(32)->Arg(128); - void BM_ScopedAnnotationEnabled_Nested(int iters, int annotation_size) { testing::StopTiming(); std::string annotation = GenerateRandomString(annotation_size); @@ -138,20 +124,5 @@ void BM_ScopedAnnotationEnabled_Adhoc_Lambda(int iters, int annotation_size) { BENCHMARK(BM_ScopedAnnotationEnabled_Adhoc_Lambda)->Arg(8)->Arg(32)->Arg(128); -void BM_ScopedAnnotationEnabled_TwoPartsLambda(int iters, int annotation_size) { - testing::StopTiming(); - std::string annotation = GenerateRandomString(annotation_size); - tracing::ScopedAnnotation::Enable(true); - testing::StartTiming(); - for (int i = 0; i < iters; i++) { - tracing::ScopedAnnotation trace( - [&]() { return absl::StrCat(annotation, ":", annotation); }); - } - testing::StopTiming(); - tracing::ScopedAnnotation::Enable(false); -} - -BENCHMARK(BM_ScopedAnnotationEnabled_TwoPartsLambda)->Arg(8)->Arg(32)->Arg(128); - } // namespace } // namespace tensorflow From 26b8dea943fae55bec9801216f8772700829e219 Mon Sep 17 00:00:00 2001 From: Kaixi Hou Date: Tue, 23 Jul 2019 15:30:19 -0700 Subject: [PATCH 0417/3053] set use_padded_io as true for param size computation --- tensorflow/core/kernels/cudnn_rnn_ops.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/cudnn_rnn_ops.cc b/tensorflow/core/kernels/cudnn_rnn_ops.cc index 55e8bc134bc..6ca6b47988c 100644 --- a/tensorflow/core/kernels/cudnn_rnn_ops.cc +++ b/tensorflow/core/kernels/cudnn_rnn_ops.cc @@ -1041,7 +1041,7 @@ class CudnnRNNKernelCommon : public OpKernel { num_layers, h_num_units, input_size, /*cell_size=*/c_num_units, /*batch_size=*/0, input_mode, rnn_direction_mode(), 
rnn_mode(), ToDataType::value, algo_config, dropout(), seed(), - /* state_allocator=*/nullptr, /*use_padded_io=*/false); + /* state_allocator=*/nullptr, /*use_padded_io=*/true); if (!rnn_desc_s.ok()) { return FromExecutorStatus(rnn_desc_s); } From bca5e7385f5eaf59caf9ccf2d093435a0f820c15 Mon Sep 17 00:00:00 2001 From: Sergei Lebedev Date: Tue, 23 Jul 2019 15:07:01 -0700 Subject: [PATCH 0418/3053] Inlined tensor_shape.{scalar,vector,matrix} Explicit constructor call is no less clear and match what we export via the public API. The functions will be removed once all the internal users are migrated. PiperOrigin-RevId: 259620054 --- .../bigtable/python/ops/bigtable_api.py | 2 +- .../batch/categorical_split_handler_test.py | 24 +++++----- .../learner/batch/ordinal_split_handler.py | 7 +-- .../batch/ordinal_split_handler_test.py | 48 +++++++++---------- .../stats_accumulator_ops_test.py | 24 +++++----- .../python/ops/batch_ops_utils.py | 3 +- .../python/ops/stats_accumulator_ops.py | 4 +- .../python/training/functions/gbdt_batch.py | 10 ++-- .../distributions/python/ops/batch_reshape.py | 6 +-- .../distributions/python/ops/binomial.py | 2 +- .../distributions/python/ops/cauchy.py | 2 +- .../distributions/python/ops/deterministic.py | 2 +- .../distributions/python/ops/geometric.py | 2 +- .../distributions/python/ops/gumbel.py | 2 +- .../distributions/python/ops/half_normal.py | 2 +- .../distributions/python/ops/inverse_gamma.py | 2 +- .../distributions/python/ops/logistic.py | 2 +- .../python/ops/negative_binomial.py | 2 +- .../distributions/python/ops/poisson.py | 2 +- .../python/ops/poisson_lognormal.py | 2 +- .../learn/python/learn/estimators/model_fn.py | 2 +- .../contrib/nn/python/ops/alpha_dropout.py | 5 +- .../python/slim/data/parallel_reader_test.py | 6 ++- .../training/python/training/bucket_ops.py | 6 +-- .../kernel_tests/group_by_reducer_test.py | 10 ++-- .../python/data/experimental/ops/batching.py | 2 +- .../kernel_tests/dataset_checkpoint_test.py | 2 +- .../python/data/kernel_tests/dataset_test.py | 2 +- .../python/data/kernel_tests/optional_test.py | 2 +- tensorflow/python/data/ops/dataset_ops.py | 10 ++-- tensorflow/python/data/ops/readers.py | 2 +- tensorflow/python/data/util/sparse_test.py | 43 +++++++++-------- tensorflow/python/data/util/structure_test.py | 35 +++++++------- tensorflow/python/eager/function_test.py | 8 ++-- .../python/feature_column/feature_column.py | 4 +- .../feature_column/feature_column_v2.py | 5 +- tensorflow/python/framework/common_shapes.py | 2 +- .../python/framework/common_shapes_test.py | 40 ++++++++-------- .../framework/function_def_to_graph_test.py | 17 ++++--- tensorflow/python/framework/ops_test.py | 4 +- tensorflow/python/framework/tensor_shape.py | 4 ++ .../python/framework/tensor_shape_test.py | 14 ++---- tensorflow/python/framework/tensor_util.py | 8 ++-- tensorflow/python/grappler/datasets_test.py | 5 +- tensorflow/python/grappler/item_test.py | 2 +- .../kernel_tests/control_flow_ops_py_test.py | 2 +- .../python/kernel_tests/list_ops_test.py | 7 +-- .../kernel_tests/tensor_array_ops_test.py | 6 +-- .../python/ops/accumulate_n_benchmark.py | 2 +- tensorflow/python/ops/array_ops.py | 3 +- tensorflow/python/ops/data_flow_ops.py | 4 +- .../python/ops/distributions/bernoulli.py | 2 +- tensorflow/python/ops/distributions/beta.py | 2 +- .../python/ops/distributions/categorical.py | 2 +- tensorflow/python/ops/distributions/gamma.py | 2 +- .../python/ops/distributions/laplace.py | 2 +- tensorflow/python/ops/distributions/normal.py | 2 +- 
.../python/ops/distributions/student_t.py | 2 +- .../python/ops/distributions/uniform.py | 2 +- tensorflow/python/ops/lookup_ops.py | 2 +- tensorflow/python/ops/nn_ops.py | 5 +- tensorflow/python/ops/tensor_array_ops.py | 4 +- tensorflow/python/ops/while_v2.py | 3 +- 63 files changed, 223 insertions(+), 221 deletions(-) diff --git a/tensorflow/contrib/bigtable/python/ops/bigtable_api.py b/tensorflow/contrib/bigtable/python/ops/bigtable_api.py index 4f1d7990ce6..e55c0dc7806 100644 --- a/tensorflow/contrib/bigtable/python/ops/bigtable_api.py +++ b/tensorflow/contrib/bigtable/python/ops/bigtable_api.py @@ -476,7 +476,7 @@ class BigtableTable(object): if tensor_type != dtypes.string: raise ValueError("Not all elements of the dataset were `tf.string`") for shape in nest.flatten(dataset_ops.get_legacy_output_shapes(dataset)): - if not shape.is_compatible_with(tensor_shape.scalar()): + if not shape.is_compatible_with(tensor_shape.TensorShape([])): raise ValueError("Not all elements of the dataset were scalars") if len(column_families) != len(columns): raise ValueError("len(column_families) != len(columns)") diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler_test.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler_test.py index 386dc19fc7b..04dec603667 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler_test.py +++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler_test.py @@ -60,8 +60,8 @@ class EqualitySplitHandlerTest(test_util.TensorFlowTestCase): indices = [[0, 0], [0, 1], [2, 0], [3, 0]] values = array_ops.constant([1, 2, 2, 1], dtype=dtypes.int64) - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) class_id = -1 split_handler = categorical_split_handler.EqualitySplitHandler( @@ -183,8 +183,8 @@ class EqualitySplitHandlerTest(test_util.TensorFlowTestCase): indices = [[0, 0], [1, 0], [2, 0], [3, 0]] values = array_ops.constant([1, 2, 1, 2], dtype=dtypes.int64) - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) class_id = -1 split_handler = categorical_split_handler.EqualitySplitHandler( @@ -294,8 +294,8 @@ class EqualitySplitHandlerTest(test_util.TensorFlowTestCase): indices = [[0, 0], [0, 1], [2, 0], [3, 0]] values = array_ops.constant([1, 2, 2, 1], dtype=dtypes.int64) - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) class_id = -1 split_handler = categorical_split_handler.EqualitySplitHandler( @@ -489,8 +489,8 @@ class EqualitySplitHandlerTest(test_util.TensorFlowTestCase): indices = constant_op.constant_v1([], dtype=dtypes.int64, shape=[0, 2]) values = constant_op.constant_v1([], dtype=dtypes.int64) - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) class_id = -1 split_handler = categorical_split_handler.EqualitySplitHandler( @@ -537,8 +537,8 @@ class EqualitySplitHandlerTest(test_util.TensorFlowTestCase): indices = [[0, 0], [0, 1], [2, 0], [3, 0]] values = array_ops.constant([1, 2, 2, 1], dtype=dtypes.int64) - gradient_shape = tensor_shape.scalar() - hessian_shape = 
tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) class_id = -1 split_handler = categorical_split_handler.EqualitySplitHandler( @@ -591,8 +591,8 @@ class EqualitySplitHandlerTest(test_util.TensorFlowTestCase): indices = [[0, 0], [0, 1], [2, 0]] values = array_ops.constant([1, 2, 2], dtype=dtypes.int64) - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) class_id = -1 split_handler = categorical_split_handler.EqualitySplitHandler( diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py index 0e6a9f8f3a0..75881945fde 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py +++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py @@ -75,7 +75,6 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import function from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor -from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops @@ -261,8 +260,7 @@ class DenseSplitHandler(InequalitySplitHandler): def make_splits(self, stamp_token, next_stamp_token, class_id): """Create the best split using the accumulated stats and flush the state.""" - if (self._gradient_shape == tensor_shape.scalar() and - self._hessian_shape == tensor_shape.scalar()): + if (self._gradient_shape.rank == 0 and self._hessian_shape.rank == 0): handler = make_dense_split_scalar else: handler = make_dense_split_tensor @@ -441,8 +439,7 @@ class SparseSplitHandler(InequalitySplitHandler): def make_splits(self, stamp_token, next_stamp_token, class_id): """Create the best split using the accumulated stats and flush the state.""" - if (self._gradient_shape == tensor_shape.scalar() and - self._hessian_shape == tensor_shape.scalar()): + if self._gradient_shape.rank == 0 and self._hessian_shape.rank == 0: handler = make_sparse_split_scalar else: handler = make_sparse_split_tensor diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py index 4a1b528646e..d41463d002f 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py +++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py @@ -63,8 +63,8 @@ class DenseSplitHandlerTest(test_util.TensorFlowTestCase): partition_ids = array_ops.constant([0, 0, 0, 1], dtype=dtypes.int32) class_id = -1 - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) split_handler = ordinal_split_handler.DenseSplitHandler( l1_regularization=0.1, l2_regularization=1., @@ -197,8 +197,8 @@ class DenseSplitHandlerTest(test_util.TensorFlowTestCase): partition_ids = array_ops.constant([1, 1, 1, 2], dtype=dtypes.int32) class_id = -1 - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) split_handler = ordinal_split_handler.DenseSplitHandler( 
l1_regularization=0.1, l2_regularization=1., @@ -333,8 +333,8 @@ class DenseSplitHandlerTest(test_util.TensorFlowTestCase): partition_ids = array_ops.constant([0, 0, 0, 1], dtype=dtypes.int32) class_id = -1 - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) split_handler = ordinal_split_handler.DenseSplitHandler( l1_regularization=0.2, l2_regularization=2., @@ -645,8 +645,8 @@ class DenseSplitHandlerTest(test_util.TensorFlowTestCase): hessians = array_ops.constant([0.12, 0.07, 0.2, 0.13]) partition_ids = array_ops.constant([0, 0, 0, 1], dtype=dtypes.int32) - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) class_id = -1 split_handler = ordinal_split_handler.DenseSplitHandler( @@ -720,8 +720,8 @@ class DenseSplitHandlerTest(test_util.TensorFlowTestCase): hessians = array_ops.constant([0.12, 0.07, 0.2, 0.13]) partition_ids = array_ops.constant([0, 0, 0, 1], dtype=dtypes.int32) - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) class_id = -1 split_handler = ordinal_split_handler.DenseSplitHandler( @@ -854,8 +854,8 @@ class DenseSplitHandlerTest(test_util.TensorFlowTestCase): hessians = array_ops.constant([0.12, 0.07, 0.2, 2]) partition_ids = array_ops.constant([0, 0, 0, 1], dtype=dtypes.int32) - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) class_id = -1 split_handler = ordinal_split_handler.DenseSplitHandler( @@ -965,8 +965,8 @@ class SparseSplitHandlerTest(test_util.TensorFlowTestCase): values = array_ops.constant([0.52, 0.3, 0.52]) sparse_column = sparse_tensor.SparseTensor(indices, values, [4, 1]) - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) class_id = -1 split_handler = ordinal_split_handler.SparseSplitHandler( @@ -1088,8 +1088,8 @@ class SparseSplitHandlerTest(test_util.TensorFlowTestCase): values = array_ops.constant([0.52, 0.3, 0.52]) sparse_column = sparse_tensor.SparseTensor(indices, values, [4, 1]) - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) class_id = -1 split_handler = ordinal_split_handler.SparseSplitHandler( @@ -1411,8 +1411,8 @@ class SparseSplitHandlerTest(test_util.TensorFlowTestCase): values = array_ops.constant([0.52, 0.3, 0.52]) sparse_column = sparse_tensor.SparseTensor(indices, values, [4, 1]) - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) class_id = -1 split_handler = ordinal_split_handler.SparseSplitHandler( @@ -1481,8 +1481,8 @@ class SparseSplitHandlerTest(test_util.TensorFlowTestCase): values = constant_op.constant_v1([], dtype=dtypes.float32) sparse_column = sparse_tensor.SparseTensor(indices, values, [4, 1]) - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) 
class_id = -1 split_handler = ordinal_split_handler.SparseSplitHandler( @@ -1565,8 +1565,8 @@ class SparseSplitHandlerTest(test_util.TensorFlowTestCase): non_empty_indices, non_empty_values, [4, 2]) non_empty_sparse_column = non_empty_sparse_column.eval(session=sess) - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) class_id = -1 split_handler = ordinal_split_handler.SparseSplitHandler( @@ -1650,8 +1650,8 @@ class SparseSplitHandlerTest(test_util.TensorFlowTestCase): values = array_ops.constant([0.58]) sparse_column = sparse_tensor.SparseTensor(indices, values, [1, 1]) - gradient_shape = tensor_shape.scalar() - hessian_shape = tensor_shape.scalar() + gradient_shape = tensor_shape.TensorShape([]) + hessian_shape = tensor_shape.TensorShape([]) class_id = -1 split_handler = ordinal_split_handler.SparseSplitHandler( diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py index ba459e8b812..d21a0f16621 100644 --- a/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py +++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/stats_accumulator_ops_test.py @@ -32,8 +32,8 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase): with self.cached_session() as sess: accumulator = stats_accumulator_ops.StatsAccumulator( stamp_token=0, - gradient_shape=tensor_shape.scalar(), - hessian_shape=tensor_shape.scalar()) + gradient_shape=tensor_shape.TensorShape([]), + hessian_shape=tensor_shape.TensorShape([])) with ops.control_dependencies([accumulator.initializer]): op1 = accumulator.add( stamp_token=0, @@ -60,8 +60,8 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase): with self.cached_session() as sess: accumulator = stats_accumulator_ops.StatsAccumulator( stamp_token=0, - gradient_shape=tensor_shape.scalar(), - hessian_shape=tensor_shape.scalar()) + gradient_shape=tensor_shape.TensorShape([]), + hessian_shape=tensor_shape.TensorShape([])) with ops.control_dependencies([accumulator.initializer]): op1 = accumulator.add( stamp_token=0, @@ -89,8 +89,8 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase): with self.cached_session() as sess: accumulator = stats_accumulator_ops.StatsAccumulator( stamp_token=0, - gradient_shape=tensor_shape.scalar(), - hessian_shape=tensor_shape.scalar()) + gradient_shape=tensor_shape.TensorShape([]), + hessian_shape=tensor_shape.TensorShape([])) with ops.control_dependencies([accumulator.initializer]): op1 = accumulator.add( stamp_token=0, @@ -121,8 +121,8 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase): with self.cached_session() as sess: accumulator = stats_accumulator_ops.StatsAccumulator( stamp_token=0, - gradient_shape=tensor_shape.scalar(), - hessian_shape=tensor_shape.scalar()) + gradient_shape=tensor_shape.TensorShape([]), + hessian_shape=tensor_shape.TensorShape([])) with ops.control_dependencies([accumulator.initializer]): op1 = accumulator.add( stamp_token=0, @@ -162,8 +162,8 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase): with self.cached_session() as sess: accumulator = stats_accumulator_ops.StatsAccumulator( stamp_token=0, - gradient_shape=tensor_shape.scalar(), - hessian_shape=tensor_shape.scalar()) + gradient_shape=tensor_shape.TensorShape([]), + hessian_shape=tensor_shape.TensorShape([])) with 
ops.control_dependencies([accumulator.initializer]): # These will be deleted due to deserialize call. op1 = accumulator.add( @@ -199,8 +199,8 @@ class StatsAccumulatorScalarTest(test_util.TensorFlowTestCase): with self.cached_session() as sess: accumulator = stats_accumulator_ops.StatsAccumulator( stamp_token=0, - gradient_shape=tensor_shape.scalar(), - hessian_shape=tensor_shape.scalar()) + gradient_shape=tensor_shape.TensorShape([]), + hessian_shape=tensor_shape.TensorShape([])) partition, feature, grads, hessians = accumulator._make_summary( partition_ids=[1, 2, 1], feature_ids=[[2, 0], [3, 1], [2, 0]], diff --git a/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py b/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py index 4dc764f9571..8083d8fac85 100644 --- a/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py +++ b/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py @@ -25,7 +25,6 @@ import six from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops -from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops @@ -65,7 +64,7 @@ def _move_tensors(tensors, device): # logic. zero = constant_op.constant(0, dtype=dtypes.int32) with ops.device(None): - if all(tensor.shape == tensor_shape.scalar() for tensor in tensors): + if all(tensor.shape.rank == 0 for tensor in tensors): with ops.device(tensors[0].device): values = array_ops.stack(tensors) with ops.device(device): diff --git a/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py b/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py index 1f6bbbf5740..62d0d0821b2 100644 --- a/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py +++ b/tensorflow/contrib/boosted_trees/python/ops/stats_accumulator_ops.py @@ -23,7 +23,6 @@ from tensorflow.contrib.boosted_trees.python.ops import boosted_trees_ops_loader # pylint: enable=unused-import from tensorflow.contrib.boosted_trees.python.ops import gen_stats_accumulator_ops from tensorflow.python.framework import ops -from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import resources from tensorflow.python.training import saver from tensorflow.python.training.tracking import tracking @@ -134,8 +133,7 @@ class StatsAccumulator(tracking.TrackableResource): self._hessian_shape = hessian_shape self._container = container - if (gradient_shape == tensor_shape.scalar() and - hessian_shape == tensor_shape.scalar()): + if (gradient_shape.rank == 0 and hessian_shape.rank == 0): self._is_scalar = True else: self._is_scalar = False diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py index 4a13da4b5be..3d8b4efd0c1 100644 --- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py +++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py @@ -368,8 +368,8 @@ class GradientBoostedDecisionTreeModel(object): if logits_dimension == 1 or learner_config.multi_class_strategy == ( learner_pb2.LearnerConfig.TREE_PER_CLASS): - self._gradient_shape = tensor_shape.scalar() - self._hessian_shape = tensor_shape.scalar() + self._gradient_shape = tensor_shape.TensorShape([]) + self._hessian_shape = tensor_shape.TensorShape([]) else: if center_bias: raise ValueError("Center bias should be False for multiclass.") @@ -838,8 +838,8 @@ class 
GradientBoostedDecisionTreeModel(object): # Create steps accumulator. steps_accumulator = stats_accumulator_ops.StatsAccumulator( stamp_token=0, - gradient_shape=tensor_shape.scalar(), - hessian_shape=tensor_shape.scalar(), + gradient_shape=tensor_shape.TensorShape([]), + hessian_shape=tensor_shape.TensorShape([]), name="StepsAccumulator") # Create ensemble stats summaries. summary.scalar("layer_stats/num_examples", num_layer_examples) @@ -1212,7 +1212,7 @@ class GradientBoostedDecisionTreeModel(object): def _get_weights(self, hessian_shape, hessians): """Derives weights to be used based on hessians and multiclass strategy.""" - if hessian_shape == tensor_shape.scalar(): + if hessian_shape.rank == 0: # This is tree per class. weights = hessians elif len(hessian_shape.dims) == 1: diff --git a/tensorflow/contrib/distributions/python/ops/batch_reshape.py b/tensorflow/contrib/distributions/python/ops/batch_reshape.py index d4503790888..4fe4650a182 100644 --- a/tensorflow/contrib/distributions/python/ops/batch_reshape.py +++ b/tensorflow/contrib/distributions/python/ops/batch_reshape.py @@ -191,10 +191,8 @@ class BatchReshape(distribution_lib.Distribution): self.distribution.survival_function, x) def _entropy(self): - return self._call_and_reshape_output( - self.distribution.entropy, - [], - [tensor_shape.scalar()]) + return self._call_and_reshape_output(self.distribution.entropy, [], + [tensor_shape.TensorShape([])]) def _mean(self): return self._call_and_reshape_output(self.distribution.mean) diff --git a/tensorflow/contrib/distributions/python/ops/binomial.py b/tensorflow/contrib/distributions/python/ops/binomial.py index b349e5966dd..cc9e29f2669 100644 --- a/tensorflow/contrib/distributions/python/ops/binomial.py +++ b/tensorflow/contrib/distributions/python/ops/binomial.py @@ -230,7 +230,7 @@ class Binomial(distribution.Distribution): return constant_op.constant([], dtype=dtypes.int32) def _event_shape(self): - return tensor_shape.scalar() + return tensor_shape.TensorShape([]) @distribution_util.AppendDocstring(_binomial_sample_note) def _log_prob(self, counts): diff --git a/tensorflow/contrib/distributions/python/ops/cauchy.py b/tensorflow/contrib/distributions/python/ops/cauchy.py index c461833b9ae..6b1a022a312 100644 --- a/tensorflow/contrib/distributions/python/ops/cauchy.py +++ b/tensorflow/contrib/distributions/python/ops/cauchy.py @@ -173,7 +173,7 @@ class Cauchy(distribution.Distribution): return constant_op.constant([], dtype=dtypes.int32) def _event_shape(self): - return tensor_shape.scalar() + return tensor_shape.TensorShape([]) def _sample_n(self, n, seed=None): shape = array_ops.concat([[n], self.batch_shape_tensor()], 0) diff --git a/tensorflow/contrib/distributions/python/ops/deterministic.py b/tensorflow/contrib/distributions/python/ops/deterministic.py index 507c5d36794..0d57a2ddc60 100644 --- a/tensorflow/contrib/distributions/python/ops/deterministic.py +++ b/tensorflow/contrib/distributions/python/ops/deterministic.py @@ -281,7 +281,7 @@ class Deterministic(_BaseDeterministic): return constant_op.constant([], dtype=dtypes.int32) def _event_shape(self): - return tensor_shape.scalar() + return tensor_shape.TensorShape([]) def _prob(self, x): return math_ops.cast( diff --git a/tensorflow/contrib/distributions/python/ops/geometric.py b/tensorflow/contrib/distributions/python/ops/geometric.py index d62f024aa2a..0b5c47056f3 100644 --- a/tensorflow/contrib/distributions/python/ops/geometric.py +++ b/tensorflow/contrib/distributions/python/ops/geometric.py @@ -132,7 +132,7 @@ 
class Geometric(distribution.Distribution): return array_ops.constant([], dtype=dtypes.int32) def _event_shape(self): - return tensor_shape.scalar() + return tensor_shape.TensorShape([]) def _sample_n(self, n, seed=None): # Uniform variates must be sampled from the open-interval `(0, 1)` rather diff --git a/tensorflow/contrib/distributions/python/ops/gumbel.py b/tensorflow/contrib/distributions/python/ops/gumbel.py index 4b50df5b481..341d63f573b 100644 --- a/tensorflow/contrib/distributions/python/ops/gumbel.py +++ b/tensorflow/contrib/distributions/python/ops/gumbel.py @@ -178,7 +178,7 @@ class _Gumbel(distribution.Distribution): return constant_op.constant([], dtype=dtypes.int32) def _event_shape(self): - return tensor_shape.scalar() + return tensor_shape.TensorShape([]) def _sample_n(self, n, seed=None): # Uniform variates must be sampled from the open-interval `(0, 1)` rather diff --git a/tensorflow/contrib/distributions/python/ops/half_normal.py b/tensorflow/contrib/distributions/python/ops/half_normal.py index f1216370869..1f04090b3ac 100644 --- a/tensorflow/contrib/distributions/python/ops/half_normal.py +++ b/tensorflow/contrib/distributions/python/ops/half_normal.py @@ -150,7 +150,7 @@ class HalfNormal(distribution.Distribution): return constant_op.constant([], dtype=dtypes.int32) def _event_shape(self): - return tensor_shape.scalar() + return tensor_shape.TensorShape([]) def _sample_n(self, n, seed=None): shape = array_ops.concat([[n], self.batch_shape_tensor()], 0) diff --git a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py index 9f1e9d5cd1b..343a7f5a9c0 100644 --- a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py +++ b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py @@ -187,7 +187,7 @@ class InverseGamma(distribution.Distribution): return constant_op.constant([], dtype=dtypes.int32) def _event_shape(self): - return tensor_shape.scalar() + return tensor_shape.TensorShape([]) @distribution_util.AppendDocstring( """Note: See `tf.random.gamma` docstring for sampling details and diff --git a/tensorflow/contrib/distributions/python/ops/logistic.py b/tensorflow/contrib/distributions/python/ops/logistic.py index 21c9b5a3544..03c5ba2997a 100644 --- a/tensorflow/contrib/distributions/python/ops/logistic.py +++ b/tensorflow/contrib/distributions/python/ops/logistic.py @@ -173,7 +173,7 @@ class Logistic(distribution.Distribution): return constant_op.constant([], dtype=dtypes.int32) def _event_shape(self): - return tensor_shape.scalar() + return tensor_shape.TensorShape([]) def _sample_n(self, n, seed=None): # Uniform variates must be sampled from the open-interval `(0, 1)` rather diff --git a/tensorflow/contrib/distributions/python/ops/negative_binomial.py b/tensorflow/contrib/distributions/python/ops/negative_binomial.py index 6acfc5746a0..9ab98d17aee 100644 --- a/tensorflow/contrib/distributions/python/ops/negative_binomial.py +++ b/tensorflow/contrib/distributions/python/ops/negative_binomial.py @@ -145,7 +145,7 @@ class NegativeBinomial(distribution.Distribution): return array_ops.constant([], dtype=dtypes.int32) def _event_shape(self): - return tensor_shape.scalar() + return tensor_shape.TensorShape([]) def _sample_n(self, n, seed=None): # Here we use the fact that if: diff --git a/tensorflow/contrib/distributions/python/ops/poisson.py b/tensorflow/contrib/distributions/python/ops/poisson.py index 3d055085cc7..64c41c57d79 100644 --- 
a/tensorflow/contrib/distributions/python/ops/poisson.py +++ b/tensorflow/contrib/distributions/python/ops/poisson.py @@ -151,7 +151,7 @@ class Poisson(distribution.Distribution): return constant_op.constant([], dtype=dtypes.int32) def _event_shape(self): - return tensor_shape.scalar() + return tensor_shape.TensorShape([]) @distribution_util.AppendDocstring(_poisson_sample_note) def _log_prob(self, x): diff --git a/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py b/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py index 85683e3233d..b23a3231d27 100644 --- a/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py +++ b/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py @@ -355,7 +355,7 @@ class PoissonLogNormalQuadratureCompound(distribution_lib.Distribution): self.mixture_distribution.logits.shape)[:-1] def _event_shape(self): - return tensor_shape.scalar() + return tensor_shape.TensorShape([]) def _sample_n(self, n, seed=None): # Get ids as a [n, batch_size]-shaped matrix, unless batch_shape=[] then get diff --git a/tensorflow/contrib/learn/python/learn/estimators/model_fn.py b/tensorflow/contrib/learn/python/learn/estimators/model_fn.py index 5ce5c02cc63..fcabbf69425 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/model_fn.py +++ b/tensorflow/contrib/learn/python/learn/estimators/model_fn.py @@ -162,7 +162,7 @@ class ModelFnOps( loss_shape = loss.get_shape() if loss_shape.num_elements() not in (None, 1): raise ValueError('Loss must be scalar: %s.' % loss) - if not loss_shape.is_compatible_with(tensor_shape.scalar()): + if not loss_shape.is_compatible_with(tensor_shape.TensorShape([])): loss = array_ops.reshape(loss, []) # Validate predictions. diff --git a/tensorflow/contrib/nn/python/ops/alpha_dropout.py b/tensorflow/contrib/nn/python/ops/alpha_dropout.py index 2b64a78c223..ad9f223f302 100644 --- a/tensorflow/contrib/nn/python/ops/alpha_dropout.py +++ b/tensorflow/contrib/nn/python/ops/alpha_dropout.py @@ -19,12 +19,11 @@ from __future__ import print_function import numbers from tensorflow.python.framework import ops -from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops -from tensorflow.python.ops import random_ops from tensorflow.python.ops import gen_math_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import random_ops def alpha_dropout(x, keep_prob, noise_shape=None, seed=None, name=None): # pylint: disable=invalid-name @@ -61,7 +60,7 @@ def alpha_dropout(x, keep_prob, noise_shape=None, seed=None, name=None): # pylin keep_prob = ops.convert_to_tensor(keep_prob, dtype=x.dtype, name="keep_prob") - keep_prob.get_shape().assert_is_compatible_with(tensor_shape.scalar()) + keep_prob.get_shape().assert_has_rank(0) # Do nothing if we know keep_prob == 1 if tensor_util.constant_value(keep_prob) == 1: diff --git a/tensorflow/contrib/slim/python/slim/data/parallel_reader_test.py b/tensorflow/contrib/slim/python/slim/data/parallel_reader_test.py index c457d44e07b..dec5cbc6d22 100644 --- a/tensorflow/contrib/slim/python/slim/data/parallel_reader_test.py +++ b/tensorflow/contrib/slim/python/slim/data/parallel_reader_test.py @@ -144,14 +144,16 @@ class ParallelReaderTest(test.TestCase): capacity=55, min_after_dequeue=28, dtypes=[dtypes_lib.string, dtypes_lib.string], - shapes=[tensor_shape.scalar(), tensor_shape.scalar()]) + shapes=[tensor_shape.TensorShape([]), + tensor_shape.TensorShape([])]) 
self._verify_read_up_to_out(shared_queue) def testReadUpToFromFIFOQueue(self): shared_queue = data_flow_ops.FIFOQueue( capacity=99, dtypes=[dtypes_lib.string, dtypes_lib.string], - shapes=[tensor_shape.scalar(), tensor_shape.scalar()]) + shapes=[tensor_shape.TensorShape([]), + tensor_shape.TensorShape([])]) self._verify_read_up_to_out(shared_queue) diff --git a/tensorflow/contrib/training/python/training/bucket_ops.py b/tensorflow/contrib/training/python/training/bucket_ops.py index 10f3f88f3eb..7a4abc47f95 100644 --- a/tensorflow/contrib/training/python/training/bucket_ops.py +++ b/tensorflow/contrib/training/python/training/bucket_ops.py @@ -212,7 +212,7 @@ def bucket(tensors, else static_batch_size) bucket_shapes = [ - tensor_shape.vector(maybe_static_batch_size).concatenate(s) + tensor_shape.TensorShape([maybe_static_batch_size]).concatenate(s) for s in bucket_queues[0].shapes ] # top_queue is a PaddingFIFOQueue even if the bucket queues are regular FIFO @@ -222,7 +222,7 @@ def bucket(tensors, top_queue = data_flow_ops.PaddingFIFOQueue( capacity=capacity, dtypes=[dtypes.int32] + types, - shapes=[tensor_shape.scalar()] + bucket_shapes, + shapes=[tensor_shape.TensorShape([])] + bucket_shapes, shared_name=shared_name, name="top_queue") @@ -403,7 +403,7 @@ def bucket_by_sequence_length(input_length, which_bucket = math_ops.cast(which_bucket, dtypes.int32) if shapes is not None: - shapes = [tensor_shape.scalar()] + shapes + shapes = [tensor_shape.TensorShape([])] + shapes _, dequeued = bucket( tensors=[input_length] + tensor_list, diff --git a/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py b/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py index 60b493b5d77..0e9042b2ef8 100644 --- a/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py @@ -46,7 +46,7 @@ class GroupByReducerTest(test_base.DatasetTestBase): grouping.group_by_reducer(lambda x: x % 2, reducer)) self.assertDatasetProduces( dataset, - expected_shapes=tensor_shape.scalar(), + expected_shapes=tensor_shape.TensorShape([]), expected_output=[(i - 1) * i, i * i]) def testAverage(self): @@ -65,7 +65,7 @@ class GroupByReducerTest(test_base.DatasetTestBase): lambda x: math_ops.cast(x, dtypes.int64) % 2, reducer)) self.assertDatasetProduces( dataset, - expected_shapes=tensor_shape.scalar(), + expected_shapes=tensor_shape.TensorShape([]), expected_output=[i - 1, i]) def testConcat(self): @@ -81,8 +81,8 @@ class GroupByReducerTest(test_base.DatasetTestBase): grouping.group_by_reducer(lambda x, y: y % 2, reducer)) self.assertDatasetProduces( dataset, - expected_shapes=tensor_shape.scalar(), - expected_output=[b"acegikmoqs" [:i], b"bdfhjlnprt" [:i]]) + expected_shapes=tensor_shape.TensorShape([]), + expected_output=[b"acegikmoqs"[:i], b"bdfhjlnprt"[:i]]) def testSparseSum(self): def _sparse(i): @@ -100,7 +100,7 @@ class GroupByReducerTest(test_base.DatasetTestBase): grouping.group_by_reducer(lambda x: x.values[0] % 2, reducer)) self.assertDatasetProduces( dataset, - expected_shapes=tensor_shape.scalar(), + expected_shapes=tensor_shape.TensorShape([]), expected_output=[(i - 1) * i, i * i]) def testChangingStateShape(self): diff --git a/tensorflow/python/data/experimental/ops/batching.py b/tensorflow/python/data/experimental/ops/batching.py index a7fd0bf0ccc..5dc2c1c76d8 100644 --- a/tensorflow/python/data/experimental/ops/batching.py +++ b/tensorflow/python/data/experimental/ops/batching.py @@ 
-244,7 +244,7 @@ class _DenseToSparseBatchDataset(dataset_ops.UnaryDataset): self._batch_size = batch_size self._row_shape = row_shape self._element_spec = sparse_tensor.SparseTensorSpec( - tensor_shape.vector(None).concatenate(self._row_shape), + tensor_shape.TensorShape([None]).concatenate(self._row_shape), dataset_ops.get_legacy_output_types(input_dataset)) if compat.forward_compatible(2019, 8, 3): diff --git a/tensorflow/python/data/kernel_tests/dataset_checkpoint_test.py b/tensorflow/python/data/kernel_tests/dataset_checkpoint_test.py index 6dcd94ea020..82bdf20a43b 100644 --- a/tensorflow/python/data/kernel_tests/dataset_checkpoint_test.py +++ b/tensorflow/python/data/kernel_tests/dataset_checkpoint_test.py @@ -142,7 +142,7 @@ class DatasetCheckpointTest(test_base.DatasetTestBase): with ops.Graph().as_default() as g: # Create an empty IteratorResource and restore the Iterator into it. output_types = dtypes.int64 - output_shapes = tensor_shape.scalar() + output_shapes = tensor_shape.TensorShape([]) iterator = iterator_ops.Iterator.from_structure(output_types, output_shapes) restore_op = self._restore_op(iterator._iterator_resource) diff --git a/tensorflow/python/data/kernel_tests/dataset_test.py b/tensorflow/python/data/kernel_tests/dataset_test.py index cbcaa0e5251..348228b4f92 100644 --- a/tensorflow/python/data/kernel_tests/dataset_test.py +++ b/tensorflow/python/data/kernel_tests/dataset_test.py @@ -287,7 +287,7 @@ class DatasetTest(test_base.DatasetTestBase, parameterized.TestCase): dataset_ops.get_structure(dataset), expected_element_structure)) self.assertEqual([dtypes.variant], structure.get_flat_tensor_types(dataset_structure)) - self.assertEqual([tensor_shape.scalar()], + self.assertEqual([tensor_shape.TensorShape([])], structure.get_flat_tensor_shapes(dataset_structure)) # Assert that the `Dataset` survives a round-trip via _from_tensor_list() diff --git a/tensorflow/python/data/kernel_tests/optional_test.py b/tensorflow/python/data/kernel_tests/optional_test.py index 13f0e08c9cc..3ab6717b9c3 100644 --- a/tensorflow/python/data/kernel_tests/optional_test.py +++ b/tensorflow/python/data/kernel_tests/optional_test.py @@ -290,7 +290,7 @@ class OptionalTest(test_base.DatasetTestBase, parameterized.TestCase): expected_value_structure)) self.assertEqual([dtypes.variant], structure.get_flat_tensor_types(opt_structure)) - self.assertEqual([tensor_shape.scalar()], + self.assertEqual([tensor_shape.TensorShape([])], structure.get_flat_tensor_shapes(opt_structure)) # All OptionalSpec objects are not compatible with a non-optional diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py index 7216b5b9d38..c60ebe94c31 100644 --- a/tensorflow/python/data/ops/dataset_ops.py +++ b/tensorflow/python/data/ops/dataset_ops.py @@ -3165,7 +3165,7 @@ def _padding_value_to_tensor(value, output_type): TypeError: if the padding value's type does not match `output_type`. 
""" value = ops.convert_to_tensor(value, name="padding_value") - if not value.shape.is_compatible_with(tensor_shape.scalar()): + if not value.shape.is_compatible_with(tensor_shape.TensorShape([])): raise ValueError("Padding value should be a scalar, but is not: %s" % value) if value.dtype != output_type: raise TypeError("Padding value tensor (%s) does not match output type: %s" % @@ -3229,10 +3229,10 @@ class PaddedBatchDataset(UnaryDataset): drop_remainder, dtype=dtypes.bool, name="drop_remainder") def _padded_shape_to_batch_shape(s): - return tensor_shape.vector( - tensor_util.constant_value(self._batch_size) if smart_cond. - smart_constant_value(self._drop_remainder) else None).concatenate( - tensor_util.constant_value_as_shape(s)) + return tensor_shape.TensorShape([ + tensor_util.constant_value(self._batch_size) + if smart_cond.smart_constant_value(self._drop_remainder) else None + ]).concatenate(tensor_util.constant_value_as_shape(s)) output_shapes = nest.map_structure( _padded_shape_to_batch_shape, self._padded_shapes) diff --git a/tensorflow/python/data/ops/readers.py b/tensorflow/python/data/ops/readers.py index 5ece97fd0dd..dab33fe2a18 100644 --- a/tensorflow/python/data/ops/readers.py +++ b/tensorflow/python/data/ops/readers.py @@ -53,7 +53,7 @@ def _create_or_validate_filenames_dataset(filenames): raise TypeError( "`filenames` must be a `tf.data.Dataset` of `tf.string` elements.") if not dataset_ops.get_legacy_output_shapes(filenames).is_compatible_with( - tensor_shape.scalar()): + tensor_shape.TensorShape([])): raise TypeError( "`filenames` must be a `tf.data.Dataset` of scalar `tf.string` " "elements.") diff --git a/tensorflow/python/data/util/sparse_test.py b/tensorflow/python/data/util/sparse_test.py index 06acf55ab9d..3b9eed128a2 100644 --- a/tensorflow/python/data/util/sparse_test.py +++ b/tensorflow/python/data/util/sparse_test.py @@ -87,64 +87,67 @@ class SparseTest(test.TestCase): "expected": () }, { - "types": tensor_shape.scalar(), + "types": tensor_shape.TensorShape([]), "classes": ops.Tensor, - "expected": tensor_shape.scalar() + "expected": tensor_shape.TensorShape([]) }, { - "types": tensor_shape.scalar(), + "types": tensor_shape.TensorShape([]), "classes": sparse_tensor.SparseTensor, "expected": tensor_shape.unknown_shape() }, { - "types": (tensor_shape.scalar()), + "types": (tensor_shape.TensorShape([])), "classes": (ops.Tensor), - "expected": (tensor_shape.scalar()) + "expected": (tensor_shape.TensorShape([])) }, { - "types": (tensor_shape.scalar()), + "types": (tensor_shape.TensorShape([])), "classes": (sparse_tensor.SparseTensor), "expected": (tensor_shape.unknown_shape()) }, { - "types": (tensor_shape.scalar(), ()), + "types": (tensor_shape.TensorShape([]), ()), "classes": (ops.Tensor, ()), - "expected": (tensor_shape.scalar(), ()) + "expected": (tensor_shape.TensorShape([]), ()) }, { - "types": ((), tensor_shape.scalar()), + "types": ((), tensor_shape.TensorShape([])), "classes": ((), ops.Tensor), - "expected": ((), tensor_shape.scalar()) + "expected": ((), tensor_shape.TensorShape([])) }, { - "types": (tensor_shape.scalar(), ()), + "types": (tensor_shape.TensorShape([]), ()), "classes": (sparse_tensor.SparseTensor, ()), "expected": (tensor_shape.unknown_shape(), ()) }, { - "types": ((), tensor_shape.scalar()), + "types": ((), tensor_shape.TensorShape([])), "classes": ((), sparse_tensor.SparseTensor), "expected": ((), tensor_shape.unknown_shape()) }, { - "types": (tensor_shape.scalar(), (), tensor_shape.scalar()), + "types": (tensor_shape.TensorShape([]), 
(), + tensor_shape.TensorShape([])), "classes": (ops.Tensor, (), ops.Tensor), - "expected": (tensor_shape.scalar(), (), tensor_shape.scalar()) + "expected": (tensor_shape.TensorShape([]), (), + tensor_shape.TensorShape([])) }, { - "types": (tensor_shape.scalar(), (), tensor_shape.scalar()), - "classes": (sparse_tensor.SparseTensor, (), - sparse_tensor.SparseTensor), + "types": (tensor_shape.TensorShape([]), (), + tensor_shape.TensorShape([])), + "classes": + (sparse_tensor.SparseTensor, (), sparse_tensor.SparseTensor), "expected": (tensor_shape.unknown_shape(), (), tensor_shape.unknown_shape()) }, { - "types": ((), tensor_shape.scalar(), ()), + "types": ((), tensor_shape.TensorShape([]), ()), "classes": ((), ops.Tensor, ()), - "expected": ((), tensor_shape.scalar(), ()) + "expected": ((), tensor_shape.TensorShape([]), ()) }, { - "types": ((), tensor_shape.scalar(), ()), + "types": ((), tensor_shape.TensorShape([]), ()), "classes": ((), sparse_tensor.SparseTensor, ()), "expected": ((), tensor_shape.unknown_shape(), ()) }, diff --git a/tensorflow/python/data/util/structure_test.py b/tensorflow/python/data/util/structure_test.py index 8781a1933c5..c8fdfed740f 100644 --- a/tensorflow/python/data/util/structure_test.py +++ b/tensorflow/python/data/util/structure_test.py @@ -525,40 +525,43 @@ class StructureTest(test_base.DatasetTestBase, parameterized.TestCase, structure.from_tensor_list(s_2, flat_s_1) @parameterized.named_parameters( - ("Tensor", dtypes.float32, tensor_shape.scalar(), ops.Tensor, - tensor_spec.TensorSpec([], dtypes.float32)), - ("SparseTensor", dtypes.int32, tensor_shape.matrix( - 2, 2), sparse_tensor.SparseTensor, + ("Tensor", dtypes.float32, tensor_shape.TensorShape( + []), ops.Tensor, tensor_spec.TensorSpec([], dtypes.float32)), + ("SparseTensor", dtypes.int32, tensor_shape.TensorShape( + [2, 2]), sparse_tensor.SparseTensor, sparse_tensor.SparseTensorSpec([2, 2], dtypes.int32)), - ("TensorArray_0", dtypes.int32, tensor_shape.as_shape( - [None, True, 2, 2]), tensor_array_ops.TensorArray, + ("TensorArray_0", dtypes.int32, + tensor_shape.TensorShape([None, True, 2, 2 + ]), tensor_array_ops.TensorArray, tensor_array_ops.TensorArraySpec( [2, 2], dtypes.int32, dynamic_size=None, infer_shape=True)), - ("TensorArray_1", dtypes.int32, tensor_shape.as_shape( - [True, None, 2, 2]), tensor_array_ops.TensorArray, + ("TensorArray_1", dtypes.int32, + tensor_shape.TensorShape([True, None, 2, 2 + ]), tensor_array_ops.TensorArray, tensor_array_ops.TensorArraySpec( [2, 2], dtypes.int32, dynamic_size=True, infer_shape=None)), - ("TensorArray_2", dtypes.int32, tensor_shape.as_shape( - [True, False, 2, 2]), tensor_array_ops.TensorArray, + ("TensorArray_2", dtypes.int32, + tensor_shape.TensorShape([True, False, 2, 2 + ]), tensor_array_ops.TensorArray, tensor_array_ops.TensorArraySpec( [2, 2], dtypes.int32, dynamic_size=True, infer_shape=False)), - ("RaggedTensor", dtypes.int32, tensor_shape.matrix( - 2, None), ragged_tensor.RaggedTensorSpec([2, None], dtypes.int32, 1), + ("RaggedTensor", dtypes.int32, tensor_shape.TensorShape([2, None]), + ragged_tensor.RaggedTensorSpec([2, None], dtypes.int32, 1), ragged_tensor.RaggedTensorSpec([2, None], dtypes.int32, 1)), ("Nested", { "a": dtypes.float32, "b": (dtypes.int32, dtypes.string) }, { - "a": tensor_shape.scalar(), - "b": (tensor_shape.matrix(2, 2), tensor_shape.scalar()) + "a": tensor_shape.TensorShape([]), + "b": (tensor_shape.TensorShape([2, 2]), tensor_shape.TensorShape([])) }, { "a": ops.Tensor, "b": (sparse_tensor.SparseTensor, ops.Tensor) }, { 
"a": tensor_spec.TensorSpec([], dtypes.float32), - "b": (sparse_tensor.SparseTensorSpec([2, 2], dtypes.int32), - tensor_spec.TensorSpec([], dtypes.string)) + "b": (sparse_tensor.SparseTensorSpec( + [2, 2], dtypes.int32), tensor_spec.TensorSpec([], dtypes.string)) }), ) def testConvertLegacyStructure(self, output_types, output_shapes, diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py index 477d6b19227..a922baaa2d4 100644 --- a/tensorflow/python/eager/function_test.py +++ b/tensorflow/python/eager/function_test.py @@ -683,7 +683,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase): compiled = def_function.function(f) var_handle = compiled() self.assertEqual(var_handle.dtype, dtypes.resource) - self.assertEqual(var_handle.shape, tensor_shape.scalar()) + self.assertEqual(var_handle.shape, tensor_shape.TensorShape([])) var_t = resource_variable_ops.read_variable_op(var_handle, dtype=v.dtype) self.assertEqual(var_t.shape, tensor_shape.TensorShape([2, 2])) @@ -760,7 +760,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase): compiled = def_function.function(f) var_handle = compiled() self.assertEqual(var_handle.dtype, dtypes.resource) - self.assertEqual(var_handle.shape, tensor_shape.scalar()) + self.assertEqual(var_handle.shape, tensor_shape.TensorShape([])) var_t = resource_variable_ops.read_variable_op(var_handle, dtype=v.dtype) self.assertEqual(var_t.shape, tensor_shape.TensorShape([2, 2])) @@ -790,14 +790,14 @@ class FunctionTest(test.TestCase, parameterized.TestCase): def f(): tl, value = list_ops.tensor_list_pop_back( tensor_list, element_dtype=dtypes.float32) - self.assertEqual(value.shape, tensor_shape.scalar()) + self.assertEqual(value.shape, tensor_shape.TensorShape([])) return tl compiled = def_function.function(f) output_tensor_list = compiled() _, value = list_ops.tensor_list_pop_back( output_tensor_list, element_dtype=dtypes.float32) - self.assertEqual(value.shape, tensor_shape.scalar()) + self.assertEqual(value.shape, tensor_shape.TensorShape([])) @test_util.run_in_graph_and_eager_modes def testDefunForcesResourceVariables(self): diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py index 7445556d421..f783f219034 100644 --- a/tensorflow/python/feature_column/feature_column.py +++ b/tensorflow/python/feature_column/feature_column.py @@ -2462,7 +2462,7 @@ class _EmbeddingColumn( @property def _variable_shape(self): if not hasattr(self, '_shape'): - self._shape = tensor_shape.vector(self.dimension) + self._shape = tensor_shape.TensorShape([self.dimension]) return self._shape def _get_dense_tensor_internal(self, @@ -2573,7 +2573,7 @@ class _SharedEmbeddingColumn( @property def _variable_shape(self): if not hasattr(self, '_shape'): - self._shape = tensor_shape.vector(self.dimension) + self._shape = tensor_shape.TensorShape([self.dimension]) return self._shape def _get_dense_tensor_internal(self, diff --git a/tensorflow/python/feature_column/feature_column_v2.py b/tensorflow/python/feature_column/feature_column_v2.py index d232565a6b3..260d0a2183c 100644 --- a/tensorflow/python/feature_column/feature_column_v2.py +++ b/tensorflow/python/feature_column/feature_column_v2.py @@ -3134,7 +3134,7 @@ class EmbeddingColumn( @property def variable_shape(self): """See `DenseColumn` base class.""" - return tensor_shape.vector(self.dimension) + return tensor_shape.TensorShape([self.dimension]) @property @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, @@ -3418,7 
+3418,8 @@ class SharedEmbeddingColumn( @property def variable_shape(self): """See `DenseColumn` base class.""" - return tensor_shape.vector(self.shared_embedding_column_creator.dimension) + return tensor_shape.TensorShape( + [self.shared_embedding_column_creator.dimension]) @property def _variable_shape(self): diff --git a/tensorflow/python/framework/common_shapes.py b/tensorflow/python/framework/common_shapes.py index 422bc7abf32..11612295d92 100644 --- a/tensorflow/python/framework/common_shapes.py +++ b/tensorflow/python/framework/common_shapes.py @@ -42,7 +42,7 @@ def rank(tensor): def scalar_shape(unused_op): """Shape function for ops that output a scalar value.""" - return [tensor_shape.scalar()] + return [tensor_shape.TensorShape([])] def unchanged_shape(op): diff --git a/tensorflow/python/framework/common_shapes_test.py b/tensorflow/python/framework/common_shapes_test.py index 24e079eefbe..5cc48b4f42b 100644 --- a/tensorflow/python/framework/common_shapes_test.py +++ b/tensorflow/python/framework/common_shapes_test.py @@ -63,11 +63,11 @@ class CommonShapesTest(test_util.TensorFlowTestCase): self.assertEqual(expected, common_shapes.broadcast_shape(shape2, shape1)) def testBroadcast_one_dimension(self): - s1 = tensor_shape.vector(5) - s2 = tensor_shape.vector(7) + s1 = tensor_shape.TensorShape([5]) + s2 = tensor_shape.TensorShape([7]) unknown = tensor_shape.unknown_shape() - scalar = tensor_shape.scalar() + scalar = tensor_shape.TensorShape([]) expanded_scalar = tensor_shape.TensorShape([1]) # Tensors with same shape should have the same broadcast result. @@ -90,13 +90,13 @@ class CommonShapesTest(test_util.TensorFlowTestCase): def testBroadcast_many_dimensions(self): unknown = tensor_shape.unknown_shape() - shape_0 = tensor_shape.scalar() - shape_1 = tensor_shape.vector(1) - shape_4 = tensor_shape.vector(4) - shape_1x4 = tensor_shape.matrix(1, 4) - shape_4x1 = tensor_shape.matrix(4, 1) - shape_3x4 = tensor_shape.matrix(3, 4) - shape_4x3 = tensor_shape.matrix(4, 3) + shape_0 = tensor_shape.TensorShape([]) + shape_1 = tensor_shape.TensorShape([1]) + shape_4 = tensor_shape.TensorShape([4]) + shape_1x4 = tensor_shape.TensorShape([1, 4]) + shape_4x1 = tensor_shape.TensorShape([4, 1]) + shape_3x4 = tensor_shape.TensorShape([3, 4]) + shape_4x3 = tensor_shape.TensorShape([4, 3]) # Tensors with same shape should have the same broadcast result. 
for shape in ( @@ -113,7 +113,7 @@ class CommonShapesTest(test_util.TensorFlowTestCase): self._assert_broadcast(expected=unknown, shape1=shape, shape2=unknown) self._assert_broadcast(expected=shape_1x4, shape1=shape_4, shape2=shape_1x4) - shape_4x4 = tensor_shape.matrix(4, 4) + shape_4x4 = tensor_shape.TensorShape([4, 4]) self._assert_broadcast(expected=shape_4x4, shape1=shape_4, shape2=shape_4x1) self._assert_broadcast(expected=shape_3x4, shape1=shape_4, shape2=shape_3x4) self._assert_incompatible_broadcast(shape1=shape_4, shape2=shape_4x3) @@ -155,14 +155,14 @@ class CommonShapesTest(test_util.TensorFlowTestCase): def testBroadcast_unknown_dims(self): unknown = tensor_shape.unknown_shape() - shape_0 = tensor_shape.scalar() - shape_1 = tensor_shape.vector(1) + shape_0 = tensor_shape.TensorShape([]) + shape_1 = tensor_shape.TensorShape([1]) # pylint: disable=invalid-name - shape_U = tensor_shape.vector(None) - shape_1xU = tensor_shape.matrix(1, None) - shape_Ux1 = tensor_shape.matrix(None, 1) - shape_4xU = tensor_shape.matrix(4, None) - shape_Ux4 = tensor_shape.matrix(None, 4) + shape_U = tensor_shape.TensorShape([None]) + shape_1xU = tensor_shape.TensorShape([1, None]) + shape_Ux1 = tensor_shape.TensorShape([None, 1]) + shape_4xU = tensor_shape.TensorShape([4, None]) + shape_Ux4 = tensor_shape.TensorShape([None, 4]) # pylint: enable=invalid-name # Tensors with same shape should have the same broadcast result. @@ -183,7 +183,7 @@ class CommonShapesTest(test_util.TensorFlowTestCase): self._assert_broadcast_with_unknown_dims( expected=shape_1xU, shape1=shape_U, shape2=shape_1xU) - shape_UxU = tensor_shape.matrix(None, None) # pylint: disable=invalid-name + shape_UxU = tensor_shape.TensorShape([None, None]) # pylint: disable=invalid-name self._assert_broadcast_with_unknown_dims( expected=shape_UxU, shape1=shape_U, shape2=shape_Ux1) self._assert_broadcast_with_unknown_dims( @@ -200,7 +200,7 @@ class CommonShapesTest(test_util.TensorFlowTestCase): expected=shape_4xU, shape1=shape_Ux1, shape2=shape_4xU) self._assert_broadcast_with_unknown_dims( expected=shape_Ux4, shape1=shape_Ux1, shape2=shape_Ux4) - shape_4x4 = tensor_shape.matrix(4, 4) + shape_4x4 = tensor_shape.TensorShape([4, 4]) self._assert_broadcast_with_unknown_dims( expected=shape_4x4, shape1=shape_4xU, shape2=shape_Ux4) diff --git a/tensorflow/python/framework/function_def_to_graph_test.py b/tensorflow/python/framework/function_def_to_graph_test.py index b557f3ba192..3c58598371c 100644 --- a/tensorflow/python/framework/function_def_to_graph_test.py +++ b/tensorflow/python/framework/function_def_to_graph_test.py @@ -75,15 +75,18 @@ class FunctionDefToGraphTest(test.TestCase): self.assertIsNone(g.outputs[1].shape.dims) # Unknown dims. 
g = function_def_to_graph.function_def_to_graph( - fdef, input_shapes=[tensor_shape.vector(5), - tensor_shape.vector(5)]) + fdef, + input_shapes=[ + tensor_shape.TensorShape([5]), + tensor_shape.TensorShape([5]) + ]) self.assertSequenceEqual(g.inputs[0].shape.dims, [5]) self.assertSequenceEqual(g.inputs[1].shape.dims, [5]) self.assertSequenceEqual(g.outputs[0].shape.dims, [5]) self.assertSequenceEqual(g.outputs[1].shape.dims, [5]) g = function_def_to_graph.function_def_to_graph( - fdef, input_shapes=[None, tensor_shape.matrix(5, 7)]) + fdef, input_shapes=[None, tensor_shape.TensorShape([5, 7])]) self.assertIsNone(g.inputs[0].shape.dims) self.assertSequenceEqual(g.inputs[1].shape.dims, [5, 7]) self.assertSequenceEqual(g.outputs[0].shape.dims, [5, 7]) @@ -93,7 +96,7 @@ class FunctionDefToGraphTest(test.TestCase): # the number of input args in FunctionDef.signature.input_arg. with self.assertRaises(ValueError): g = function_def_to_graph.function_def_to_graph( - fdef, input_shapes=[tensor_shape.matrix(5, 7)]) + fdef, input_shapes=[tensor_shape.TensorShape([5, 7])]) class FunctionDefToGraphDefTest(test.TestCase): @@ -177,8 +180,10 @@ class FunctionDefToGraphDefTest(test.TestCase): fdef = self._build_function_def() g, _ = function_def_to_graph.function_def_to_graph_def( fdef, - input_shapes=[tensor_shape.scalar(), - tensor_shape.vector(5), None]) + input_shapes=[ + tensor_shape.TensorShape([]), + tensor_shape.TensorShape([5]), None + ]) self.assertEqual("shape" in g.node[0].attr, True) self.assertSequenceEqual( tensor_shape.TensorShape(g.node[0].attr["shape"].shape).as_list(), []) diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py index 2fdc42e1dea..1b272cf5253 100644 --- a/tensorflow/python/framework/ops_test.py +++ b/tensorflow/python/framework/ops_test.py @@ -136,7 +136,7 @@ class TensorAndShapeTest(test_util.TensorFlowTestCase): a = array_ops.placeholder(dtype=dtypes.float32, shape=[]) b = array_ops.ones([]) c = a + b - self.assertEqual(tensor_shape.scalar(), c.shape) + self.assertEqual(tensor_shape.TensorShape([]), c.shape) @test_util.run_deprecated_v1 def testShapeFunctionError(self): @@ -783,7 +783,7 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase): self.assertEqual(op.name, "myop") self.assertEqual(op.type, "Identity") self.assertEqual(len(op.outputs), 1) - self.assertEqual(op.outputs[0].shape, tensor_shape.matrix(2, 3)) + self.assertEqual(op.outputs[0].shape, tensor_shape.TensorShape([2, 3])) def testUniqueName(self): g = ops.Graph() diff --git a/tensorflow/python/framework/tensor_shape.py b/tensorflow/python/framework/tensor_shape.py index 14fbddabd00..4a26b7224ae 100644 --- a/tensorflow/python/framework/tensor_shape.py +++ b/tensorflow/python/framework/tensor_shape.py @@ -22,6 +22,7 @@ from tensorflow.python import tf2 from tensorflow.python.eager import monitoring from tensorflow.python.framework import dtypes from tensorflow.python.util import compat +from tensorflow.python.util import deprecation from tensorflow.python.util.tf_export import tf_export _TENSORSHAPE_V2_OVERRIDE = None @@ -1238,11 +1239,13 @@ def unknown_shape(rank=None, **kwargs): return TensorShape([Dimension(None)] * rank) +@deprecation.deprecated(None, "Use tf.TensorShape([]).") def scalar(): """Returns a shape representing a scalar.""" return TensorShape([]) +@deprecation.deprecated(None, "Use tf.TensorShape([length]).") def vector(length): """Returns a shape representing a vector. 
@@ -1255,6 +1258,7 @@ def vector(length): return TensorShape([length]) +@deprecation.deprecated(None, "Use tf.TensorShape([rows, cols]).") def matrix(rows, cols): """Returns a shape representing a matrix. diff --git a/tensorflow/python/framework/tensor_shape_test.py b/tensorflow/python/framework/tensor_shape_test.py index 5fa78f2041a..ccbf5cf9208 100644 --- a/tensorflow/python/framework/tensor_shape_test.py +++ b/tensorflow/python/framework/tensor_shape_test.py @@ -377,14 +377,6 @@ class ShapeTest(test_util.TensorFlowTestCase, parameterized.TestCase): self._testMostSpecificCompatibleShapeHelper([1, 1, 3], [None, 2, 3], [None, None, 3]) - def testHelpers(self): - tensor_shape.TensorShape([]).assert_is_compatible_with( - tensor_shape.scalar()) - tensor_shape.TensorShape([37]).assert_is_compatible_with( - tensor_shape.vector(37)) - tensor_shape.TensorShape( - [94, 43]).assert_is_compatible_with(tensor_shape.matrix(94, 43)) - def testTruedivFails(self): unknown = tensor_shape.Dimension(None) self.assertEqual((unknown // unknown).value, None) @@ -430,9 +422,9 @@ class ShapeTest(test_util.TensorFlowTestCase, parameterized.TestCase): self.assertEqual( "(32, None, 1, 9)", str(tensor_shape.TensorShape([32, None, 1, 9])).replace("?", "None")) - self.assertEqual("()", str(tensor_shape.scalar())) - self.assertEqual("(7,)", str(tensor_shape.vector(7))) - self.assertEqual("(3, 8)", str(tensor_shape.matrix(3, 8))) + self.assertEqual("()", str(tensor_shape.TensorShape([]))) + self.assertEqual("(7,)", str(tensor_shape.TensorShape([7]))) + self.assertEqual("(3, 8)", str(tensor_shape.TensorShape([3, 8]))) self.assertEqual("(4, 5, 2)", str(tensor_shape.TensorShape([4, 5, 2]))) def testAsProto(self): diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py index d957b6b0647..daf4b0977c1 100644 --- a/tensorflow/python/framework/tensor_util.py +++ b/tensorflow/python/framework/tensor_util.py @@ -833,11 +833,11 @@ def constant_value_as_shape(tensor): # pylint: disable=invalid-name shape = tensor.get_shape().with_rank(1) if shape == [0]: - return tensor_shape.scalar() + return tensor_shape.TensorShape([]) elif tensor.op.type == "Shape": return tensor.op.inputs[0].get_shape() elif tensor.op.type == "Pack": - ret = tensor_shape.scalar() # Empty list. + ret = tensor_shape.TensorShape([]) # Empty list. # Since we expect rank 1 inputs, Pack's axis must be zero, otherwise it # would not be rank 1. assert tensor.op.get_attr("axis") == 0 @@ -855,7 +855,7 @@ def constant_value_as_shape(tensor): # pylint: disable=invalid-name # We assume that `tensor.op.inputs[0]` evaluates to 0, as this is # the only legal value when concatenating vectors, and it will # have been checked by a previous shape function. - ret = tensor_shape.scalar() # Empty list. + ret = tensor_shape.TensorShape([]) # Empty list. for concat_input in tensor.op.inputs[1:]: # `concat_input` must be a vector. Attempt to evaluate it as a shape, # and concatenate it with `ret`. @@ -865,7 +865,7 @@ def constant_value_as_shape(tensor): # pylint: disable=invalid-name # We assume that `tensor.op.inputs[-1]` evaluates to 0, as this is # the only legal value when concatenating vectors, and it will # have been checked by a previous shape function. - ret = tensor_shape.scalar() # Empty list. + ret = tensor_shape.TensorShape([]) # Empty list. for concat_input in tensor.op.inputs[:-1]: # `concat_input` must be a vector. Attempt to evaluate it as a shape, # and concatenate it with `ret`. 
diff --git a/tensorflow/python/grappler/datasets_test.py b/tensorflow/python/grappler/datasets_test.py index 6937301ab25..e2587633969 100644 --- a/tensorflow/python/grappler/datasets_test.py +++ b/tensorflow/python/grappler/datasets_test.py @@ -129,8 +129,9 @@ class GrapplerTest(test.TestCase): mg = meta_graph.create_meta_graph_def(graph=g) grappler_item = item.Item(mg) op_properties = grappler_item.GetOpProperties() - self.assertEqual(tensor_shape.scalar(), - op_properties['IteratorGetNext'][0].shape) + self.assertEqual( + tensor_shape.TensorShape([]), + op_properties['IteratorGetNext'][0].shape) def _testTransformation(self, fn): test_cases = [{ diff --git a/tensorflow/python/grappler/item_test.py b/tensorflow/python/grappler/item_test.py index c02fd9f55b8..3ec901a15ea 100644 --- a/tensorflow/python/grappler/item_test.py +++ b/tensorflow/python/grappler/item_test.py @@ -80,7 +80,7 @@ class ItemTest(test.TestCase): else: self.assertEqual(1, len(node_prop)) self.assertEqual(dtypes.int32, node_prop[0].dtype) - self.assertEqual(tensor_shape.scalar(), node_prop[0].shape) + self.assertEqual(tensor_shape.TensorShape([]), node_prop[0].shape) def testUpdates(self): with ops.Graph().as_default() as g: diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py index be11d4a88eb..9bc9f303d91 100644 --- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py +++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py @@ -391,7 +391,7 @@ class ControlFlowTest(test.TestCase, parameterized.TestCase): b = control_flow_ops.cond( constant_op.constant(True), lambda: math_ops.square(x), lambda: math_ops.subtract(x, 1.)) - self.assertEqual(b.shape, tensor_shape.scalar()) + self.assertEqual(b.shape, tensor_shape.TensorShape([])) @test_util.run_v1_only("b/120545219") def testFetchable(self): diff --git a/tensorflow/python/kernel_tests/list_ops_test.py b/tensorflow/python/kernel_tests/list_ops_test.py index f6046f425c5..052e012187c 100644 --- a/tensorflow/python/kernel_tests/list_ops_test.py +++ b/tensorflow/python/kernel_tests/list_ops_test.py @@ -1166,10 +1166,10 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase): self.assertEqual(fn(tensor_shape.unknown_shape()), -1) # Scalar shape -> [] with type int32. 
self.assertEqual(fn([]).dtype, dtypes.int32) - self.assertEqual(fn(tensor_shape.scalar()).dtype, dtypes.int32) + self.assertEqual(fn(tensor_shape.TensorShape([])).dtype, dtypes.int32) self.assertAllEqual(self.evaluate(fn([])), np.array([], np.int32)) self.assertAllEqual( - self.evaluate(fn(tensor_shape.scalar())), np.array([], np.int32)) + self.evaluate(fn(tensor_shape.TensorShape([]))), np.array([], np.int32)) # Tensor -> Tensor shape = constant_op.constant(1) self.assertIs(fn(shape), shape) @@ -1327,7 +1327,8 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase): def testConcatListWithScalarElementShapeFails(self): l = list_ops.empty_tensor_list( - element_dtype=dtypes.float32, element_shape=tensor_shape.scalar()) + element_dtype=dtypes.float32, + element_shape=tensor_shape.TensorShape([])) with self.assertRaisesRegexp( errors.InvalidArgumentError, "Concat requires elements to be at least vectors, " diff --git a/tensorflow/python/kernel_tests/tensor_array_ops_test.py b/tensorflow/python/kernel_tests/tensor_array_ops_test.py index 1d2a0e727a7..1cdfdf0436d 100644 --- a/tensorflow/python/kernel_tests/tensor_array_ops_test.py +++ b/tensorflow/python/kernel_tests/tensor_array_ops_test.py @@ -1034,7 +1034,7 @@ class TensorArrayTest(test.TestCase): dtype=dtypes.float32, size=num_steps, clear_after_read=False, - element_shape=tensor_shape.scalar()) + element_shape=tensor_shape.TensorShape([])) i = constant_op.constant(0, name="i") c = lambda i, acc: i < 5 @@ -1693,10 +1693,10 @@ class TensorArrayTest(test.TestCase): self.assertEqual(dtypes.float32, ta0.dtype) self.assertEqual(dtypes.int32, ta1.dtype) if context.executing_eagerly(): - self.assertEqual(tensor_shape.scalar(), read0.get_shape()) + self.assertEqual(tensor_shape.TensorShape([]), read0.get_shape()) else: self.assertEqual(tensor_shape.unknown_shape(), read0.get_shape()) - self.assertEqual(tensor_shape.scalar(), read1.get_shape()) + self.assertEqual(tensor_shape.TensorShape([]), read1.get_shape()) if not context.executing_eagerly(): self.evaluate(variables.global_variables_initializer()) diff --git a/tensorflow/python/ops/accumulate_n_benchmark.py b/tensorflow/python/ops/accumulate_n_benchmark.py index a709066cae4..08349003dc3 100644 --- a/tensorflow/python/ops/accumulate_n_benchmark.py +++ b/tensorflow/python/ops/accumulate_n_benchmark.py @@ -60,7 +60,7 @@ class AccumulateNBenchmark(test.Benchmark): return self._AccumulateNTemplate( inputs, init=array_ops.zeros_like(gen_control_flow_ops.merge(inputs)[0]), - shape=tensor_shape.vector(0), + shape=tensor_shape.TensorShape([0]), validate_shape=False) def _AccumulateNInitializedWithShape(self, inputs): diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 063c081f4c6..981d531cdc2 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -1307,8 +1307,7 @@ def concat(values, axis, name="concat"): with ops.name_scope(name) as scope: ops.convert_to_tensor( axis, name="concat_dim", - dtype=dtypes.int32).get_shape().assert_is_compatible_with( - tensor_shape.scalar()) + dtype=dtypes.int32).get_shape().assert_has_rank(0) return identity(values[0], name=scope) return gen_array_ops.concat_v2(values=values, axis=axis, name=name) diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py index 9c49fc85270..38253320181 100644 --- a/tensorflow/python/ops/data_flow_ops.py +++ b/tensorflow/python/ops/data_flow_ops.py @@ -1092,8 +1092,8 @@ class Barrier(object): else: batch_dim = 
tensor_shape.Dimension( tensor_util.constant_value(op.inputs[1])) - op.outputs[0].set_shape(tensor_shape.vector(batch_dim)) # indices - op.outputs[1].set_shape(tensor_shape.vector(batch_dim)) # keys + op.outputs[0].set_shape(tensor_shape.TensorShape([batch_dim])) # indices + op.outputs[1].set_shape(tensor_shape.TensorShape([batch_dim])) # keys for output, shape in zip(op.outputs[2:], self._shapes): # value_list output.set_shape( tensor_shape.TensorShape([batch_dim]).concatenate(shape)) diff --git a/tensorflow/python/ops/distributions/bernoulli.py b/tensorflow/python/ops/distributions/bernoulli.py index 4fb598aef4d..d77b3d14627 100644 --- a/tensorflow/python/ops/distributions/bernoulli.py +++ b/tensorflow/python/ops/distributions/bernoulli.py @@ -120,7 +120,7 @@ class Bernoulli(distribution.Distribution): return array_ops.constant([], dtype=dtypes.int32) def _event_shape(self): - return tensor_shape.scalar() + return tensor_shape.TensorShape([]) def _sample_n(self, n, seed=None): new_shape = array_ops.concat([[n], self.batch_shape_tensor()], 0) diff --git a/tensorflow/python/ops/distributions/beta.py b/tensorflow/python/ops/distributions/beta.py index c1ec6ed6c69..9460627d5d7 100644 --- a/tensorflow/python/ops/distributions/beta.py +++ b/tensorflow/python/ops/distributions/beta.py @@ -238,7 +238,7 @@ class Beta(distribution.Distribution): return constant_op.constant([], dtype=dtypes.int32) def _event_shape(self): - return tensor_shape.scalar() + return tensor_shape.TensorShape([]) def _sample_n(self, n, seed=None): expanded_concentration1 = array_ops.ones_like( diff --git a/tensorflow/python/ops/distributions/categorical.py b/tensorflow/python/ops/distributions/categorical.py index 33a84356250..1b2a8f53a72 100644 --- a/tensorflow/python/ops/distributions/categorical.py +++ b/tensorflow/python/ops/distributions/categorical.py @@ -266,7 +266,7 @@ class Categorical(distribution.Distribution): return constant_op.constant([], dtype=dtypes.int32) def _event_shape(self): - return tensor_shape.scalar() + return tensor_shape.TensorShape([]) def _sample_n(self, n, seed=None): if self.logits.get_shape().ndims == 2: diff --git a/tensorflow/python/ops/distributions/gamma.py b/tensorflow/python/ops/distributions/gamma.py index 6fb105c2cbe..a459697fbce 100644 --- a/tensorflow/python/ops/distributions/gamma.py +++ b/tensorflow/python/ops/distributions/gamma.py @@ -210,7 +210,7 @@ class Gamma(distribution.Distribution): return constant_op.constant([], dtype=dtypes.int32) def _event_shape(self): - return tensor_shape.scalar() + return tensor_shape.TensorShape([]) @distribution_util.AppendDocstring( """Note: See `tf.random.gamma` docstring for sampling details and diff --git a/tensorflow/python/ops/distributions/laplace.py b/tensorflow/python/ops/distributions/laplace.py index a96b58ba1a6..02ec64f0e26 100644 --- a/tensorflow/python/ops/distributions/laplace.py +++ b/tensorflow/python/ops/distributions/laplace.py @@ -153,7 +153,7 @@ class Laplace(distribution.Distribution): return constant_op.constant([], dtype=dtypes.int32) def _event_shape(self): - return tensor_shape.scalar() + return tensor_shape.TensorShape([]) def _sample_n(self, n, seed=None): shape = array_ops.concat([[n], self.batch_shape_tensor()], 0) diff --git a/tensorflow/python/ops/distributions/normal.py b/tensorflow/python/ops/distributions/normal.py index 0b36054db2f..4c77cf9120c 100644 --- a/tensorflow/python/ops/distributions/normal.py +++ b/tensorflow/python/ops/distributions/normal.py @@ -189,7 +189,7 @@ class 
Normal(distribution.Distribution): return constant_op.constant([], dtype=dtypes.int32) def _event_shape(self): - return tensor_shape.scalar() + return tensor_shape.TensorShape([]) def _sample_n(self, n, seed=None): shape = array_ops.concat([[n], self.batch_shape_tensor()], 0) diff --git a/tensorflow/python/ops/distributions/student_t.py b/tensorflow/python/ops/distributions/student_t.py index efc3290592d..4a5d3ea0d84 100644 --- a/tensorflow/python/ops/distributions/student_t.py +++ b/tensorflow/python/ops/distributions/student_t.py @@ -241,7 +241,7 @@ class StudentT(distribution.Distribution): return constant_op.constant([], dtype=math_ops.int32) def _event_shape(self): - return tensor_shape.scalar() + return tensor_shape.TensorShape([]) def _sample_n(self, n, seed=None): # The sampling method comes from the fact that if: diff --git a/tensorflow/python/ops/distributions/uniform.py b/tensorflow/python/ops/distributions/uniform.py index 0221ccff78c..64fb0eadee7 100644 --- a/tensorflow/python/ops/distributions/uniform.py +++ b/tensorflow/python/ops/distributions/uniform.py @@ -165,7 +165,7 @@ class Uniform(distribution.Distribution): return constant_op.constant([], dtype=dtypes.int32) def _event_shape(self): - return tensor_shape.scalar() + return tensor_shape.TensorShape([]) def _sample_n(self, n, seed=None): shape = array_ops.concat([[n], self.batch_shape_tensor()], 0) diff --git a/tensorflow/python/ops/lookup_ops.py b/tensorflow/python/ops/lookup_ops.py index 802a5b2d261..a1f15059c40 100644 --- a/tensorflow/python/ops/lookup_ops.py +++ b/tensorflow/python/ops/lookup_ops.py @@ -166,7 +166,7 @@ class InitializableLookupTableBase(LookupInterface): initializer.value_dtype) self._default_value = ops.convert_to_tensor( default_value, dtype=self._value_dtype) - self._default_value.get_shape().merge_with(tensor_shape.scalar()) + self._default_value.get_shape().merge_with(tensor_shape.TensorShape([])) if isinstance(initializer, trackable_base.Trackable): self._initializer = self._track_trackable(initializer, "_initializer") with ops.init_scope(): diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index 418a34fce50..f5e9aea7194 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -2282,7 +2282,8 @@ def atrous_conv2d_transpose(value, data_format="NHWC") output_shape_ = ops.convert_to_tensor(output_shape, name="output_shape") - if not output_shape_.get_shape().is_compatible_with(tensor_shape.vector(4)): + if not output_shape_.get_shape().is_compatible_with( + tensor_shape.TensorShape([4])): raise ValueError("output_shape must have shape (4,), got {}".format( output_shape_.get_shape())) @@ -4233,7 +4234,7 @@ def dropout_v2(x, rate, noise_shape=None, seed=None, name=None): else: rate = ops.convert_to_tensor( rate, dtype=x.dtype, name="rate") - rate.get_shape().assert_is_compatible_with(tensor_shape.scalar()) + rate.get_shape().assert_has_rank(0) # Do nothing if we know rate == 0 if tensor_util.constant_value(rate) == 0: diff --git a/tensorflow/python/ops/tensor_array_ops.py b/tensorflow/python/ops/tensor_array_ops.py index 8007fd78954..41802aabbb4 100644 --- a/tensorflow/python/ops/tensor_array_ops.py +++ b/tensorflow/python/ops/tensor_array_ops.py @@ -1338,8 +1338,8 @@ class TensorArraySpec(type_spec.TypeSpec): def _to_legacy_output_shapes(self): # Sneak the dynamic_size and infer_shape values into the legacy shape. 
- return (tensor_shape.matrix(self._dynamic_size, self._infer_shape) - .concatenate(self._element_shape)) + return (tensor_shape.TensorShape([self._dynamic_size, self._infer_shape + ]).concatenate(self._element_shape)) def _to_legacy_output_classes(self): return TensorArray diff --git a/tensorflow/python/ops/while_v2.py b/tensorflow/python/ops/while_v2.py index 420818920a9..7527c5cfd3e 100644 --- a/tensorflow/python/ops/while_v2.py +++ b/tensorflow/python/ops/while_v2.py @@ -107,8 +107,7 @@ def while_loop(cond, # Add loop counter needed for computing gradients. loop_vars = [loop_counter, maximum_iterations_loop_var] + loop_vars - shape_invariants = ( - [tensor_shape.scalar(), tensor_shape.scalar()] + shape_invariants) + shape_invariants = [tensor_shape.TensorShape([])] * 2 + shape_invariants signature = ( [tensor_spec.TensorSpec.from_tensor(loop_counter), tensor_spec.TensorSpec.from_tensor(maximum_iterations_loop_var)] + From c952a36db96a4b39337f22b4c7acb3661657155c Mon Sep 17 00:00:00 2001 From: Igor Ganichev Date: Tue, 23 Jul 2019 15:08:36 -0700 Subject: [PATCH 0419/3053] Add debug string to client streaming exchanges and some other logs PiperOrigin-RevId: 259620385 --- .../rpc/eager/grpc_eager_client.cc | 3 ++ .../core/distributed_runtime/rpc/grpc_call.h | 8 ++++- .../distributed_runtime/rpc/grpc_state.cc | 10 ++++-- .../core/distributed_runtime/rpc/grpc_state.h | 34 ++++++++++++++++--- 4 files changed, 47 insertions(+), 8 deletions(-) diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc index b3164f0956e..da5d43abe72 100644 --- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc +++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc @@ -147,10 +147,13 @@ class GrpcEagerClientCache : public EagerClientCache { void* tag; bool ok; while (completion_queue_.Next(&tag, &ok)) { + VLOG(4) << "GrpcEagerClientThread got next tag"; GrpcClientCQTag* callback_tag = static_cast(tag); callback_tag->OnCompleted(ok); + VLOG(4) << "GrpcEagerClientThread blocking for next tag"; } + VLOG(4) << "GrpcEagerClientThread exiting"; })); } diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_call.h b/tensorflow/core/distributed_runtime/rpc/grpc_call.h index 8809c1e6b19..e85baac0f70 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_call.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_call.h @@ -425,7 +425,13 @@ class ServerBidirectionalStreamingCall stream_(&ctx_), grpc_service_(grpc_service), cq_(cq), - enqueue_function_(enqueue_function) {} + enqueue_function_(enqueue_function) { + VLOG(3) << "Creating ServerBidirectionalStreamingCall " << this; + } + + ~ServerBidirectionalStreamingCall() override { + VLOG(3) << "Destroying ServerBidirectionalStreamingCall " << this; + } void CallOpen() override { // Let gRPC know that we can accept another call. 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_state.cc b/tensorflow/core/distributed_runtime/rpc/grpc_state.cc index 7626891d898..75e4153da40 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_state.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_state.cc @@ -54,6 +54,8 @@ void Exchange::Complete(Status status) { status.Update(errors::Internal("could not parse rpc response")); } } + VLOG(3) << "Completing exchange " << DebugString() << " with " + << status.ToString(); cb_(status); } @@ -76,12 +78,14 @@ const char* ToString(Exchange::State state) { } string Exchange::DebugString() const { - return absl::StrFormat("%p@%s", this, ToString(state_)); + return absl::StrFormat("%p@%s_%s", this, ToString(state_), debug_string_); } void ExchangeQueue::Emplace(const ::grpc::ByteBuffer& request_buf, - protobuf::Message* response, StatusCallback cb) { - exchanges_.emplace(exchanges_.end(), request_buf, response, std::move(cb)); + protobuf::Message* response, StatusCallback cb, + string debug_string) { + exchanges_.emplace(exchanges_.end(), request_buf, response, std::move(cb), + debug_string); } Exchange* ExchangeQueue::GetReadyForRequestWriting() { diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_state.h b/tensorflow/core/distributed_runtime/rpc/grpc_state.h index b12218206d3..10c9af37056 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_state.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_state.h @@ -237,11 +237,12 @@ class Exchange { }; Exchange(const ::grpc::ByteBuffer& request_buf, protobuf::Message* response, - StatusCallback cb) + StatusCallback cb, string debug_string) : state_(State::kExchangeCreated), request_buf_(request_buf), response_(response), - cb_(std::move(cb)) {} + cb_(std::move(cb)), + debug_string_(std::move(debug_string)) {} const ::grpc::ByteBuffer& request_buf() { return request_buf_; } ::grpc::ByteBuffer* response_buf() { return &response_buf_; } @@ -274,6 +275,7 @@ class Exchange { ::grpc::ByteBuffer response_buf_; protobuf::Message* response_; StatusCallback cb_; + string debug_string_; }; const char* ToString(Exchange::State s); @@ -303,7 +305,8 @@ class ExchangeQueue { public: // Creates a new exchange and adds it to the end of the queue. void Emplace(const ::grpc::ByteBuffer& request_buf, - protobuf::Message* response, StatusCallback cb); + protobuf::Message* response, StatusCallback cb, + std::string debug_string); // Returns an exchange for which we can initiated request writing, if any. // Returns nullptr if there is no such exchange. @@ -363,9 +366,15 @@ class StreamingRPCState : public UntypedStreamingRPCState { const std::shared_ptr<::grpc::ClientContext>& context) : context_(context), call_(std::move(call)), call_done_(false) { Ref(); + VLOG(3) << "Created new StreamingRPCState " << this; + VLOG(3) << "StreamingRPCState(" << this << ") calling grpc::StartCall"; call_->StartCall(&call_started_tag_); } + ~StreamingRPCState() override { + VLOG(3) << "Destructing StreamingRPCState " << this; + } + // Attempts to send the next request. `done` is invoked when // `response` has been filled with the data from the server, or if there // is an error. `done` can be invoked before SendNextRequest returns. @@ -391,12 +400,21 @@ class StreamingRPCState : public UntypedStreamingRPCState { // `done` is not invoked intentionally. return false; } - exchanges_.Emplace(request_buf, response, done); + if (VLOG_IS_ON(3)) { + // If vlog 3 is enabled, include first 100 chars of request as debug + // string. 
+ exchanges_.Emplace(request_buf, response, done, + request.ShortDebugString().substr(0, 100)); + } else { + exchanges_.Emplace(request_buf, response, done, ""); + } MaybeIssueRequestWriteLocked(); return true; } void CallStarted(bool ok) override { + VLOG(3) << "StreamingRPCState(" << this << ")::CallStarted(ok=" << ok + << ")"; mutex_lock l(mu_); if (!ok) { call_done_ = true; @@ -408,6 +426,8 @@ } void RequestWriteCompleted(bool ok) override { + VLOG(3) << "StreamingRPCState(" << this + << ")::RequestWriteCompleted(ok=" << ok << ")"; mu_.lock(); if (call_done_) { mu_.unlock(); @@ -426,6 +446,8 @@ } void ResponseReadCompleted(bool ok) override { + VLOG(3) << "StreamingRPCState(" << this + << ")::ResponseReadCompleted(ok=" << ok << ")"; mu_.lock(); if (call_done_) { mu_.unlock(); @@ -466,6 +488,8 @@ call_done_ = true; Status status = errors::Unknown("gRPC streaming call has ended: ", context_->debug_error_string()); + VLOG(2) << "Ending gRPC streaming call on the client side due to " + << status.ToString(); // Swap the exchanges_ into a temporary ExchangeQueue so that we can // complete all exchanges without holding mu_ in case user callback // reach back into this. This should be impossible now, but safer for @@ -485,6 +509,7 @@ } exchange->MarkRequestWriteIssued(); Ref(); + VLOG(3) << "StreamingRPCState(" << this << ") calling grpc::Write"; call_->Write(exchange->request_buf(), &request_write_completed_tag_); } @@ -495,6 +520,7 @@ } exchange->MarkResponseReadIssued(); Ref(); + VLOG(3) << "StreamingRPCState(" << this << ") calling grpc::Read"; call_->Read(exchange->response_buf(), &response_read_completed_tag_); } From ea461a480f89de36bd318fd3dadf9e63f7eb0694 Mon Sep 17 00:00:00 2001 From: Priya Gupta Date: Tue, 23 Jul 2019 15:12:12 -0700 Subject: [PATCH 0420/3053] Add missing experimental_between_graph implementation in OneDeviceStrategy. PiperOrigin-RevId: 259621032 --- tensorflow/python/distribute/one_device_strategy.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorflow/python/distribute/one_device_strategy.py b/tensorflow/python/distribute/one_device_strategy.py index 6a79b86a5fd..063242ad02a 100644 --- a/tensorflow/python/distribute/one_device_strategy.py +++ b/tensorflow/python/distribute/one_device_strategy.py @@ -403,6 +403,10 @@ class OneDeviceExtended(distribute_lib.StrategyExtendedV1): def experimental_should_init(self): return True + @property + def experimental_between_graph(self): + return False + @property def should_checkpoint(self): return True From 4b836de9505ecff1ce4bd99a77520752def4bee9 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Tue, 23 Jul 2019 15:38:35 -0700 Subject: [PATCH 0421/3053] Delete mlir/tensorflow/tests/graphdef2mlir/graph-11c8752c150e5643.pbtxt and graph-1383300d74bd0b22.pbtxt These tests were part of the original tests for the graphdef import; however, they are low value and not targeted, which makes them harder to maintain for no clear benefit in coverage.
PiperOrigin-RevId: 259626278 --- .../graph-11c8752c150e5643.pbtxt | 99 -- .../graph-1383300d74bd0b22.pbtxt | 1550 ----------------- 2 files changed, 1649 deletions(-) delete mode 100644 tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-11c8752c150e5643.pbtxt delete mode 100644 tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-1383300d74bd0b22.pbtxt diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-11c8752c150e5643.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-11c8752c150e5643.pbtxt deleted file mode 100644 index b2dd870d66b..00000000000 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-11c8752c150e5643.pbtxt +++ /dev/null @@ -1,99 +0,0 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s - -node { - name: "Empty/shape" - op: "Const" - device: "/job:localhost/replica:0/task:0/device:TPU:0" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - dim { - size: 2 - } - } - tensor_content: "\200\000\000\000\200\000\000\000" - } - } - } -} -node { - name: "Empty" - op: "Empty" - input: "Empty/shape" - device: "/job:localhost/replica:0/task:0/device:TPU:0" - attr { - key: "dtype" - value { - type: DT_BFLOAT16 - } - } - attr { - key: "init" - value { - b: false - } - } -} -node { - name: "Empty/_0" - op: "_Send" - input: "Empty" - device: "/job:localhost/replica:0/task:0/device:TPU:0" - attr { - key: "T" - value { - type: DT_BFLOAT16 - } - } - attr { - key: "client_terminated" - value { - b: false - } - } - attr { - key: "recv_device" - value { - s: "/job:localhost/replica:0/task:0/device:CPU:0" - } - } - attr { - key: "send_device" - value { - s: "/job:localhost/replica:0/task:0/device:TPU:0" - } - } - attr { - key: "send_device_incarnation" - value { - i: 1 - } - } - attr { - key: "tensor_name" - value { - s: "edge_5_Empty" - } - } -} -library { -} -versions { - producer: 26 -} - -# CHECK: func @main() { -# CHECK-NEXT: %0:2 = "_tf.Const"() {device = "/job:localhost/replica:0/task:0/device:TPU:0", dtype = "tfdtype$DT_INT32", name = "Empty/shape", value = dense<128> : tensor<2xi32>} : () -> (tensor<2xi32>, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Empty"(%0#0) {device = "/job:localhost/replica:0/task:0/device:TPU:0", dtype = "tfdtype$DT_BFLOAT16", init = false, name = "Empty"} : (tensor<2xi32>) -> (tensor<128x128xbf16>, !_tf.control) -# CHECK-NEXT: %2 = "_tf._Send"(%1#0) {T = "tfdtype$DT_BFLOAT16", client_terminated = false, device = "/job:localhost/replica:0/task:0/device:TPU:0", name = "Empty/_0", recv_device = "/job:localhost/replica:0/task:0/device:CPU:0", send_device = "/job:localhost/replica:0/task:0/device:TPU:0", send_device_incarnation = 1 : i64, tensor_name = "edge_5_Empty"} : (tensor<128x128xbf16>) -> !_tf.control -# CHECK-NEXT: return -# CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-1383300d74bd0b22.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-1383300d74bd0b22.pbtxt deleted file mode 100644 index 0333193be8d..00000000000 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-1383300d74bd0b22.pbtxt +++ /dev/null @@ -1,1550 +0,0 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s - -node { - name: "placeholder_0_arg" - op: "_Arg" - device: "/device:TPU_REPLICATED_CORE:0" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "index" - value { - i: 0 - } - } -} -node { - name: 
"tpu/tpu/Shape" - op: "Shape" - input: "placeholder_0_arg" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "out_type" - value { - type: DT_INT32 - } - } -} -node { - name: "tpu/tpu/strided_slice/stack" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - dim { - size: 1 - } - } - int_val: 0 - } - } - } -} -node { - name: "tpu/tpu/strided_slice/stack_1" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - dim { - size: 1 - } - } - int_val: 1 - } - } - } -} -node { - name: "tpu/tpu/strided_slice/stack_2" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - dim { - size: 1 - } - } - int_val: 1 - } - } - } -} -node { - name: "tpu/tpu/strided_slice" - op: "StridedSlice" - input: "tpu/tpu/Shape" - input: "tpu/tpu/strided_slice/stack" - input: "tpu/tpu/strided_slice/stack_1" - input: "tpu/tpu/strided_slice/stack_2" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "Index" - value { - type: DT_INT32 - } - } - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "begin_mask" - value { - i: 0 - } - } - attr { - key: "ellipsis_mask" - value { - i: 0 - } - } - attr { - key: "end_mask" - value { - i: 0 - } - } - attr { - key: "new_axis_mask" - value { - i: 0 - } - } - attr { - key: "shrink_axis_mask" - value { - i: 1 - } - } -} -node { - name: "tpu/tpu/Plus1RNNCellZeroState/ExpandDims/dim" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 0 - } - } - } -} -node { - name: "tpu/tpu/Plus1RNNCellZeroState/ExpandDims" - op: "ExpandDims" - input: "tpu/tpu/strided_slice" - input: "tpu/tpu/Plus1RNNCellZeroState/ExpandDims/dim" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "Tdim" - value { - type: DT_INT32 - } - } -} -node { - name: "tpu/tpu/Plus1RNNCellZeroState/Const" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - dim { - size: 1 - } - } - int_val: 1 - } - } - } -} -node { - name: "tpu/tpu/Plus1RNNCellZeroState/concat/axis" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 0 - } - } - } -} -node { - name: "tpu/tpu/Plus1RNNCellZeroState/concat" - op: "ConcatV2" - input: "tpu/tpu/Plus1RNNCellZeroState/ExpandDims" - input: "tpu/tpu/Plus1RNNCellZeroState/Const" - input: "tpu/tpu/Plus1RNNCellZeroState/concat/axis" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "tpu/tpu/Plus1RNNCellZeroState/zeros/Const" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - 
key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - } - float_val: 0 - } - } - } -} -node { - name: "tpu/tpu/Plus1RNNCellZeroState/zeros" - op: "Fill" - input: "tpu/tpu/Plus1RNNCellZeroState/concat" - input: "tpu/tpu/Plus1RNNCellZeroState/zeros/Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "index_type" - value { - type: DT_INT32 - } - } -} -node { - name: "tpu/tpu/Plus1RNNCellZeroState/ExpandDims_1/dim" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 0 - } - } - } -} -node { - name: "tpu/tpu/Plus1RNNCellZeroState/ExpandDims_1" - op: "ExpandDims" - input: "tpu/tpu/strided_slice" - input: "tpu/tpu/Plus1RNNCellZeroState/ExpandDims_1/dim" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "Tdim" - value { - type: DT_INT32 - } - } -} -node { - name: "tpu/tpu/Plus1RNNCellZeroState/Const_1" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - dim { - size: 1 - } - } - int_val: 1 - } - } - } -} -node { - name: "tpu/tpu/sequence_length" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - dim { - size: 1 - } - } - int_val: 1 - } - } - } -} -node { - name: "tpu/tpu/ExpandDims/dim" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 0 - } - } - } -} -node { - name: "tpu/tpu/ExpandDims" - op: "ExpandDims" - input: "tpu/tpu/strided_slice" - input: "tpu/tpu/ExpandDims/dim" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "Tdim" - value { - type: DT_INT32 - } - } -} -node { - name: "tpu/tpu/Const" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - dim { - size: 1 - } - } - int_val: 1 - } - } - } -} -node { - name: "tpu/tpu/concat/axis" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 0 - } - } - } -} -node { - name: "tpu/tpu/concat" - op: "ConcatV2" - input: "tpu/tpu/ExpandDims" - input: "tpu/tpu/Const" - input: "tpu/tpu/concat/axis" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } -} -node { - name: "tpu/tpu/zeros/Const" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - } - float_val: 0 - } - } - } -} -node { - name: "tpu/tpu/zeros" - op: "Fill" - input: "tpu/tpu/concat" - input: "tpu/tpu/zeros/Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: 
"index_type" - value { - type: DT_INT32 - } - } -} -node { - name: "tpu/tpu/Const_1" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - dim { - size: 1 - } - } - int_val: 1 - } - } - } -} -node { - name: "tpu/tpu/Const_2" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - dim { - size: 1 - } - } - int_val: 0 - } - } - } -} -node { - name: "tpu/tpu/Min" - op: "Min" - input: "tpu/tpu/sequence_length" - input: "tpu/tpu/Const_2" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } - attr { - key: "keep_dims" - value { - b: false - } - } -} -node { - name: "tpu/tpu/Const_3" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - dim { - size: 1 - } - } - int_val: 0 - } - } - } -} -node { - name: "tpu/tpu/Max" - op: "Max" - input: "tpu/tpu/sequence_length" - input: "tpu/tpu/Const_3" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "Tidx" - value { - type: DT_INT32 - } - } - attr { - key: "keep_dims" - value { - b: false - } - } -} -node { - name: "tpu/tpu/LessEqual/y" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 0 - } - } - } -} -node { - name: "tpu/tpu/LessEqual" - op: "LessEqual" - input: "tpu/tpu/sequence_length" - input: "tpu/tpu/LessEqual/y" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_INT32 - } - } -} -node { - name: "tpu/tpu/LessEqual_1/y" - op: "Const" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 0 - } - } - } -} -node { - name: "tpu/tpu/LessEqual_1" - op: "LessEqual" - input: "tpu/tpu/Max" - input: "tpu/tpu/LessEqual_1/y" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_INT32 - } - } -} -node { - name: "tpu/tpu/cond/Switch" - op: "Switch" - input: "tpu/tpu/LessEqual_1" - input: "tpu/tpu/LessEqual_1" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_BOOL - } - } -} -node { - name: "tpu/tpu/cond/switch_t" - op: "Identity" - input: "tpu/tpu/cond/Switch:1" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_BOOL - } - } -} -node { - name: "tpu/tpu/cond/switch_f" - op: "Identity" - input: "tpu/tpu/cond/Switch" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_BOOL - } - } -} -node { - name: "tpu/tpu/cond/pred_id" - op: "Identity" - input: "tpu/tpu/LessEqual_1" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_BOOL - } - } -} -node { - name: "tpu/tpu/cond/Switch_1" - op: "Switch" - input: "tpu/tpu/zeros" - input: "tpu/tpu/cond/pred_id" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@tpu/tpu/zeros" - } - } - } -} -node { - 
name: "tpu/tpu/cond/Switch_2" - op: "Switch" - input: "tpu/tpu/Plus1RNNCellZeroState/zeros" - input: "tpu/tpu/cond/pred_id" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@tpu/tpu/Plus1RNNCellZeroState/zeros" - } - } - } -} -node { - name: "tpu/tpu/cond/add/y" - op: "Const" - input: "^tpu/tpu/cond/switch_f" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - } - float_val: 1 - } - } - } -} -node { - name: "tpu/tpu/cond/add/Switch" - op: "Switch" - input: "placeholder_0_arg" - input: "tpu/tpu/cond/pred_id" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@Placeholder" - } - } - } -} -node { - name: "tpu/tpu/cond/add" - op: "Add" - input: "tpu/tpu/cond/add/Switch" - input: "tpu/tpu/cond/add/y" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "tpu/tpu/cond/add_1/y" - op: "Const" - input: "^tpu/tpu/cond/switch_f" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - } - float_val: 1 - } - } - } -} -node { - name: "tpu/tpu/cond/add_1/Switch" - op: "Switch" - input: "tpu/tpu/Plus1RNNCellZeroState/zeros" - input: "tpu/tpu/cond/pred_id" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@tpu/tpu/Plus1RNNCellZeroState/zeros" - } - } - } -} -node { - name: "tpu/tpu/cond/add_1" - op: "Add" - input: "tpu/tpu/cond/add_1/Switch" - input: "tpu/tpu/cond/add_1/y" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "tpu/tpu/cond/Greater/y" - op: "Const" - input: "^tpu/tpu/cond/switch_f" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 0 - } - } - } -} -node { - name: "tpu/tpu/cond/Greater/Switch" - op: "Switch" - input: "tpu/tpu/Min" - input: "tpu/tpu/cond/pred_id" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_class" - value { - list { - s: "loc:@tpu/tpu/Min" - } - } - } -} -node { - name: "tpu/tpu/cond/Greater" - op: "Greater" - input: "tpu/tpu/cond/Greater/Switch" - input: "tpu/tpu/cond/Greater/y" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_INT32 - } - } -} -node { - name: "tpu/tpu/cond/cond/Switch" - op: "Switch" - input: "tpu/tpu/cond/Greater" - input: "tpu/tpu/cond/Greater" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_BOOL - } - } -} -node { - name: "tpu/tpu/cond/cond/switch_t" - op: "Identity" - input: "tpu/tpu/cond/cond/Switch:1" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_BOOL - } - } -} -node { - name: "tpu/tpu/cond/cond/switch_f" - op: "Identity" - input: "tpu/tpu/cond/cond/Switch" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_BOOL - } - } -} -node { - name: "tpu/tpu/cond/cond/pred_id" - op: "Identity" - input: "tpu/tpu/cond/Greater" - device: "/device:TPU_REPLICATED_CORE" - 
attr { - key: "T" - value { - type: DT_BOOL - } - } -} -node { - name: "tpu/tpu/cond/cond/Switch_1" - op: "Switch" - input: "tpu/tpu/cond/add" - input: "tpu/tpu/cond/cond/pred_id" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@tpu/tpu/cond/add" - } - } - } -} -node { - name: "tpu/tpu/cond/cond/Switch_2" - op: "Switch" - input: "tpu/tpu/cond/add_1" - input: "tpu/tpu/cond/cond/pred_id" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@tpu/tpu/cond/add_1" - } - } - } -} -node { - name: "tpu/tpu/cond/cond/Select/Switch" - op: "Switch" - input: "tpu/tpu/LessEqual" - input: "tpu/tpu/cond/pred_id" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_BOOL - } - } - attr { - key: "_class" - value { - list { - s: "loc:@tpu/tpu/LessEqual" - } - } - } -} -node { - name: "tpu/tpu/cond/cond/Select/Switch_1" - op: "Switch" - input: "tpu/tpu/cond/cond/Select/Switch" - input: "tpu/tpu/cond/cond/pred_id" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_BOOL - } - } - attr { - key: "_class" - value { - list { - s: "loc:@tpu/tpu/LessEqual" - } - } - } -} -node { - name: "tpu/tpu/cond/cond/Select/Switch_2" - op: "Switch" - input: "tpu/tpu/zeros" - input: "tpu/tpu/cond/pred_id" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@tpu/tpu/zeros" - } - } - } -} -node { - name: "tpu/tpu/cond/cond/Select/Switch_3" - op: "Switch" - input: "tpu/tpu/cond/cond/Select/Switch_2" - input: "tpu/tpu/cond/cond/pred_id" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@tpu/tpu/zeros" - } - } - } -} -node { - name: "tpu/tpu/cond/cond/Select/Switch_4" - op: "Switch" - input: "tpu/tpu/cond/add" - input: "tpu/tpu/cond/cond/pred_id" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@tpu/tpu/cond/add" - } - } - } -} -node { - name: "tpu/tpu/cond/cond/Select" - op: "Select" - input: "tpu/tpu/cond/cond/Select/Switch_1" - input: "tpu/tpu/cond/cond/Select/Switch_3" - input: "tpu/tpu/cond/cond/Select/Switch_4" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@tpu/tpu/cond/add" - } - } - } -} -node { - name: "tpu/tpu/cond/cond/Select_1/Switch" - op: "Switch" - input: "tpu/tpu/cond/add_1/Switch" - input: "tpu/tpu/cond/cond/pred_id" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@tpu/tpu/Plus1RNNCellZeroState/zeros" - } - } - } -} -node { - name: "tpu/tpu/cond/cond/Select_1/Switch_1" - op: "Switch" - input: "tpu/tpu/cond/add_1" - input: "tpu/tpu/cond/cond/pred_id" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@tpu/tpu/cond/add_1" - } - } - } -} -node { - name: "tpu/tpu/cond/cond/Select_1" - op: "Select" - input: "tpu/tpu/cond/cond/Select/Switch_1" - input: "tpu/tpu/cond/cond/Select_1/Switch" - input: "tpu/tpu/cond/cond/Select_1/Switch_1" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: 
"T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@tpu/tpu/cond/add_1" - } - } - } -} -node { - name: "tpu/tpu/cond/cond/Merge" - op: "Merge" - input: "tpu/tpu/cond/cond/Select" - input: "tpu/tpu/cond/cond/Switch_1:1" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "tpu/tpu/cond/cond/Merge_1" - op: "Merge" - input: "tpu/tpu/cond/cond/Select_1" - input: "tpu/tpu/cond/cond/Switch_2:1" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "tpu/tpu/cond/Merge" - op: "Merge" - input: "tpu/tpu/cond/cond/Merge" - input: "tpu/tpu/cond/Switch_1:1" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "tpu/tpu/cond/Merge_1" - op: "Merge" - input: "tpu/tpu/cond/cond/Merge_1" - input: "tpu/tpu/cond/Switch_2:1" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "tpu/NoOp" - op: "NoOp" - device: "/device:TPU_REPLICATED_CORE" -} -node { - name: "tpu/packed" - op: "Pack" - input: "tpu/tpu/cond/Merge" - device: "/device:TPU_REPLICATED_CORE:0" - attr { - key: "N" - value { - i: 1 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "axis" - value { - i: 0 - } - } -} -node { - name: "tpu/Identity" - op: "Identity" - input: "tpu/packed" - device: "/device:TPU_REPLICATED_CORE:0" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "tpu/Identity_1" - op: "Identity" - input: "tpu/tpu/cond/Merge_1" - device: "/device:TPU_REPLICATED_CORE" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "tpu_identity_0_retval_RetVal" - op: "_Retval" - input: "tpu/Identity" - device: "/device:TPU_REPLICATED_CORE:0" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "index" - value { - i: 0 - } - } -} -node { - name: "tpu_identity_1_0_retval_RetVal" - op: "_Retval" - input: "tpu/Identity_1" - device: "/device:TPU_REPLICATED_CORE:0" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "index" - value { - i: 1 - } - } -} -library { -} -versions { - producer: 26 -} - -# CHECK: func @main() { -# CHECK-NEXT: %0:2 = "_tf._Arg"() {T = "tfdtype$DT_FLOAT", device = "/device:TPU_REPLICATED_CORE:0", index = 0 : i64, name = "placeholder_0_arg"} : () -> (tensor<*xf32>, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Shape"(%0#0) {T = "tfdtype$DT_FLOAT", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/Shape", out_type = "tfdtype$DT_INT32"} : (tensor<*xf32>) -> (tensor, !_tf.control) -# CHECK-NEXT: %2 = "_tf.NoOp"() {device = "/device:TPU_REPLICATED_CORE", name = "tpu/NoOp"} : () -> !_tf.control -# CHECK-NEXT: %3:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/Const", value = dense<1> : tensor<1xi32>} : () -> (tensor<1xi32>, !_tf.control) -# CHECK-NEXT: %4:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/Const_1", value = dense<1> : tensor<1xi32>} : () -> (tensor<1xi32>, !_tf.control) -# CHECK-NEXT: %5:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/Const_2", value = dense<0> : tensor<1xi32>} : () -> 
(tensor<1xi32>, !_tf.control) -# CHECK-NEXT: %6:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/Const_3", value = dense<0> : tensor<1xi32>} : () -> (tensor<1xi32>, !_tf.control) -# CHECK-NEXT: %7:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/ExpandDims/dim", value = dense<0> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %8:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/LessEqual/y", value = dense<0> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %9:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/LessEqual_1/y", value = dense<0> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %10:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/Plus1RNNCellZeroState/Const", value = dense<1> : tensor<1xi32>} : () -> (tensor<1xi32>, !_tf.control) -# CHECK-NEXT: %11:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/Plus1RNNCellZeroState/Const_1", value = dense<1> : tensor<1xi32>} : () -> (tensor<1xi32>, !_tf.control) -# CHECK-NEXT: %12:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/Plus1RNNCellZeroState/ExpandDims/dim", value = dense<0> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %13:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/Plus1RNNCellZeroState/ExpandDims_1/dim", value = dense<0> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %14:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/Plus1RNNCellZeroState/concat/axis", value = dense<0> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %15:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_FLOAT", name = "tpu/tpu/Plus1RNNCellZeroState/zeros/Const", value = dense<0.000000e+00> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %16:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/concat/axis", value = dense<0> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %17:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/sequence_length", value = dense<1> : tensor<1xi32>} : () -> (tensor<1xi32>, !_tf.control) -# CHECK-NEXT: %18:2 = "_tf.LessEqual"(%17#0, %8#0) {T = "tfdtype$DT_INT32", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/LessEqual"} : (tensor<1xi32>, tensor) -> (tensor<1xi1>, !_tf.control) -# CHECK-NEXT: %19:2 = "_tf.Max"(%17#0, %6#0) {T = "tfdtype$DT_INT32", Tidx = "tfdtype$DT_INT32", device = "/device:TPU_REPLICATED_CORE", keep_dims = false, name = "tpu/tpu/Max"} : (tensor<1xi32>, tensor<1xi32>) -> (tensor, !_tf.control) -# CHECK-NEXT: %20:2 = "_tf.LessEqual"(%19#0, %9#0) {T = "tfdtype$DT_INT32", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/LessEqual_1"} : (tensor, tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: %21:3 = "_tf.Switch"(%20#0, %20#0) {T = "tfdtype$DT_BOOL", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/Switch"} : (tensor, tensor) -> (tensor, tensor, !_tf.control) -# CHECK-NEXT: %22:2 = "_tf.Identity"(%21#0) {T = "tfdtype$DT_BOOL", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/switch_f"} : (tensor) -> (tensor, 
!_tf.control) -# CHECK-NEXT: %23:2 = "_tf.Const"(%22#1) {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/cond/Greater/y", value = dense<0> : tensor} : (!_tf.control) -> (tensor, !_tf.control) -# CHECK-NEXT: %24:2 = "_tf.Const"(%22#1) {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_FLOAT", name = "tpu/tpu/cond/add/y", value = dense<1.000000e+00> : tensor} : (!_tf.control) -> (tensor, !_tf.control) -# CHECK-NEXT: %25:2 = "_tf.Const"(%22#1) {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_FLOAT", name = "tpu/tpu/cond/add_1/y", value = dense<1.000000e+00> : tensor} : (!_tf.control) -> (tensor, !_tf.control) -# CHECK-NEXT: %26:2 = "_tf.Identity"(%21#1) {T = "tfdtype$DT_BOOL", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/switch_t"} : (tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: %27:2 = "_tf.Identity"(%20#0) {T = "tfdtype$DT_BOOL", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/pred_id"} : (tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: %28:3 = "_tf.Switch"(%0#0, %27#0) {T = "tfdtype$DT_FLOAT", _class = ["loc:@Placeholder"], device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/add/Switch"} : (tensor<*xf32>, tensor) -> (tensor<*xf32>, tensor<*xf32>, !_tf.control) -# CHECK-NEXT: %29:2 = "_tf.Add"(%28#0, %24#0) {T = "tfdtype$DT_FLOAT", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/add"} : (tensor<*xf32>, tensor) -> (tensor<*xf32>, !_tf.control) -# CHECK-NEXT: %30:3 = "_tf.Switch"(%18#0, %27#0) {T = "tfdtype$DT_BOOL", _class = ["loc:@tpu/tpu/LessEqual"], device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/Select/Switch"} : (tensor<1xi1>, tensor) -> (tensor<1xi1>, tensor<1xi1>, !_tf.control) -# CHECK-NEXT: %31:2 = "_tf.Min"(%17#0, %5#0) {T = "tfdtype$DT_INT32", Tidx = "tfdtype$DT_INT32", device = "/device:TPU_REPLICATED_CORE", keep_dims = false, name = "tpu/tpu/Min"} : (tensor<1xi32>, tensor<1xi32>) -> (tensor, !_tf.control) -# CHECK-NEXT: %32:3 = "_tf.Switch"(%31#0, %27#0) {T = "tfdtype$DT_INT32", _class = ["loc:@tpu/tpu/Min"], device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/Greater/Switch"} : (tensor, tensor) -> (tensor, tensor, !_tf.control) -# CHECK-NEXT: %33:2 = "_tf.Greater"(%32#0, %23#0) {T = "tfdtype$DT_INT32", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/Greater"} : (tensor, tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: %34:3 = "_tf.Switch"(%33#0, %33#0) {T = "tfdtype$DT_BOOL", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/Switch"} : (tensor, tensor) -> (tensor, tensor, !_tf.control) -# CHECK-NEXT: %35:2 = "_tf.Identity"(%34#0) {T = "tfdtype$DT_BOOL", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/switch_f"} : (tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: %36:2 = "_tf.Identity"(%34#1) {T = "tfdtype$DT_BOOL", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/switch_t"} : (tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: %37:2 = "_tf.Identity"(%33#0) {T = "tfdtype$DT_BOOL", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/pred_id"} : (tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: %38:3 = "_tf.Switch"(%30#0, %37#0) {T = "tfdtype$DT_BOOL", _class = ["loc:@tpu/tpu/LessEqual"], device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/Select/Switch_1"} : (tensor<1xi1>, tensor) -> (tensor<1xi1>, tensor<1xi1>, !_tf.control) -# CHECK-NEXT: %39:3 = "_tf.Switch"(%29#0, %37#0) {T = "tfdtype$DT_FLOAT", _class = ["loc:@tpu/tpu/cond/add"], device = 
"/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/Select/Switch_4"} : (tensor<*xf32>, tensor) -> (tensor<*xf32>, tensor<*xf32>, !_tf.control) -# CHECK-NEXT: %40:3 = "_tf.Switch"(%29#0, %37#0) {T = "tfdtype$DT_FLOAT", _class = ["loc:@tpu/tpu/cond/add"], device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/Switch_1"} : (tensor<*xf32>, tensor) -> (tensor<*xf32>, tensor<*xf32>, !_tf.control) -# CHECK-NEXT: %41:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/strided_slice/stack", value = dense<0> : tensor<1xi32>} : () -> (tensor<1xi32>, !_tf.control) -# CHECK-NEXT: %42:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/strided_slice/stack_1", value = dense<1> : tensor<1xi32>} : () -> (tensor<1xi32>, !_tf.control) -# CHECK-NEXT: %43:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_INT32", name = "tpu/tpu/strided_slice/stack_2", value = dense<1> : tensor<1xi32>} : () -> (tensor<1xi32>, !_tf.control) -# CHECK-NEXT: %44:2 = "_tf.StridedSlice"(%1#0, %41#0, %42#0, %43#0) {Index = "tfdtype$DT_INT32", T = "tfdtype$DT_INT32", begin_mask = 0 : i64, device = "/device:TPU_REPLICATED_CORE", ellipsis_mask = 0 : i64, end_mask = 0 : i64, name = "tpu/tpu/strided_slice", new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> (tensor, !_tf.control) -# CHECK-NEXT: %45:2 = "_tf.ExpandDims"(%44#0, %7#0) {T = "tfdtype$DT_INT32", Tdim = "tfdtype$DT_INT32", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/ExpandDims"} : (tensor, tensor) -> (tensor<1xi32>, !_tf.control) -# CHECK-NEXT: %46:2 = "_tf.ConcatV2"(%45#0, %3#0, %16#0) {N = 2 : i64, T = "tfdtype$DT_INT32", Tidx = "tfdtype$DT_INT32", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/concat"} : (tensor<1xi32>, tensor<1xi32>, tensor) -> (tensor<2xi32>, !_tf.control) -# CHECK-NEXT: %47:2 = "_tf.ExpandDims"(%44#0, %12#0) {T = "tfdtype$DT_INT32", Tdim = "tfdtype$DT_INT32", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/Plus1RNNCellZeroState/ExpandDims"} : (tensor, tensor) -> (tensor<1xi32>, !_tf.control) -# CHECK-NEXT: %48:2 = "_tf.ConcatV2"(%47#0, %10#0, %14#0) {N = 2 : i64, T = "tfdtype$DT_INT32", Tidx = "tfdtype$DT_INT32", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/Plus1RNNCellZeroState/concat"} : (tensor<1xi32>, tensor<1xi32>, tensor) -> (tensor<2xi32>, !_tf.control) -# CHECK-NEXT: %49:2 = "_tf.Fill"(%48#0, %15#0) {T = "tfdtype$DT_FLOAT", device = "/device:TPU_REPLICATED_CORE", index_type = "tfdtype$DT_INT32", name = "tpu/tpu/Plus1RNNCellZeroState/zeros"} : (tensor<2xi32>, tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: %50:3 = "_tf.Switch"(%49#0, %27#0) {T = "tfdtype$DT_FLOAT", _class = ["loc:@tpu/tpu/Plus1RNNCellZeroState/zeros"], device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/Switch_2"} : (tensor, tensor) -> (tensor, tensor, !_tf.control) -# CHECK-NEXT: %51:3 = "_tf.Switch"(%49#0, %27#0) {T = "tfdtype$DT_FLOAT", _class = ["loc:@tpu/tpu/Plus1RNNCellZeroState/zeros"], device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/add_1/Switch"} : (tensor, tensor) -> (tensor, tensor, !_tf.control) -# CHECK-NEXT: %52:2 = "_tf.Add"(%51#0, %25#0) {T = "tfdtype$DT_FLOAT", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/add_1"} : (tensor, tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: %53:3 = "_tf.Switch"(%52#0, %37#0) {T = "tfdtype$DT_FLOAT", _class = ["loc:@tpu/tpu/cond/add_1"], device = 
"/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/Select_1/Switch_1"} : (tensor, tensor) -> (tensor, tensor, !_tf.control) -# CHECK-NEXT: %54:3 = "_tf.Switch"(%52#0, %37#0) {T = "tfdtype$DT_FLOAT", _class = ["loc:@tpu/tpu/cond/add_1"], device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/Switch_2"} : (tensor, tensor) -> (tensor, tensor, !_tf.control) -# CHECK-NEXT: %55:3 = "_tf.Switch"(%51#0, %37#0) {T = "tfdtype$DT_FLOAT", _class = ["loc:@tpu/tpu/Plus1RNNCellZeroState/zeros"], device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/Select_1/Switch"} : (tensor, tensor) -> (tensor, tensor, !_tf.control) -# CHECK-NEXT: %56:2 = "_tf.Select"(%38#0, %55#0, %53#0) {T = "tfdtype$DT_FLOAT", _class = ["loc:@tpu/tpu/cond/add_1"], device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/Select_1"} : (tensor<1xi1>, tensor, tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: %57:3 = "_tf.Merge"(%56#0, %54#1) {N = 2 : i64, T = "tfdtype$DT_FLOAT", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/Merge_1"} : (tensor, tensor) -> (tensor, tensor, !_tf.control) -# CHECK-NEXT: %58:3 = "_tf.Merge"(%57#0, %50#1) {N = 2 : i64, T = "tfdtype$DT_FLOAT", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/Merge_1"} : (tensor, tensor) -> (tensor, tensor, !_tf.control) -# CHECK-NEXT: %59:2 = "_tf.Identity"(%58#0) {T = "tfdtype$DT_FLOAT", device = "/device:TPU_REPLICATED_CORE", name = "tpu/Identity_1"} : (tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: %60 = "_tf._Retval"(%59#0) {T = "tfdtype$DT_FLOAT", device = "/device:TPU_REPLICATED_CORE:0", index = 1 : i64, name = "tpu_identity_1_0_retval_RetVal"} : (tensor) -> !_tf.control -# CHECK-NEXT: %61:2 = "_tf.ExpandDims"(%44#0, %13#0) {T = "tfdtype$DT_INT32", Tdim = "tfdtype$DT_INT32", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/Plus1RNNCellZeroState/ExpandDims_1"} : (tensor, tensor) -> (tensor<1xi32>, !_tf.control) -# CHECK-NEXT: %62:2 = "_tf.Const"() {device = "/device:TPU_REPLICATED_CORE", dtype = "tfdtype$DT_FLOAT", name = "tpu/tpu/zeros/Const", value = dense<0.000000e+00> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %63:2 = "_tf.Fill"(%46#0, %62#0) {T = "tfdtype$DT_FLOAT", device = "/device:TPU_REPLICATED_CORE", index_type = "tfdtype$DT_INT32", name = "tpu/tpu/zeros"} : (tensor<2xi32>, tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: %64:3 = "_tf.Switch"(%63#0, %27#0) {T = "tfdtype$DT_FLOAT", _class = ["loc:@tpu/tpu/zeros"], device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/Switch_1"} : (tensor, tensor) -> (tensor, tensor, !_tf.control) -# CHECK-NEXT: %65:3 = "_tf.Switch"(%63#0, %27#0) {T = "tfdtype$DT_FLOAT", _class = ["loc:@tpu/tpu/zeros"], device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/Select/Switch_2"} : (tensor, tensor) -> (tensor, tensor, !_tf.control) -# CHECK-NEXT: %66:3 = "_tf.Switch"(%65#0, %37#0) {T = "tfdtype$DT_FLOAT", _class = ["loc:@tpu/tpu/zeros"], device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/Select/Switch_3"} : (tensor, tensor) -> (tensor, tensor, !_tf.control) -# CHECK-NEXT: %67:2 = "_tf.Select"(%38#0, %66#0, %39#0) {T = "tfdtype$DT_FLOAT", _class = ["loc:@tpu/tpu/cond/add"], device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/Select"} : (tensor<1xi1>, tensor, tensor<*xf32>) -> (tensor, !_tf.control) -# CHECK-NEXT: %68:3 = "_tf.Merge"(%67#0, %40#1) {N = 2 : i64, T = "tfdtype$DT_FLOAT", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/cond/Merge"} : (tensor, tensor<*xf32>) -> (tensor<*xf32>, 
tensor, !_tf.control) -# CHECK-NEXT: %69:3 = "_tf.Merge"(%68#0, %64#1) {N = 2 : i64, T = "tfdtype$DT_FLOAT", device = "/device:TPU_REPLICATED_CORE", name = "tpu/tpu/cond/Merge"} : (tensor<*xf32>, tensor) -> (tensor<*xf32>, tensor, !_tf.control) -# CHECK-NEXT: %70:2 = "_tf.Pack"(%69#0) {N = 1 : i64, T = "tfdtype$DT_FLOAT", axis = 0 : i64, device = "/device:TPU_REPLICATED_CORE:0", name = "tpu/packed"} : (tensor<*xf32>) -> (tensor<*xf32>, !_tf.control) -# CHECK-NEXT: %71:2 = "_tf.Identity"(%70#0) {T = "tfdtype$DT_FLOAT", device = "/device:TPU_REPLICATED_CORE:0", name = "tpu/Identity"} : (tensor<*xf32>) -> (tensor<*xf32>, !_tf.control) -# CHECK-NEXT: %72 = "_tf._Retval"(%71#0) {T = "tfdtype$DT_FLOAT", device = "/device:TPU_REPLICATED_CORE:0", index = 0 : i64, name = "tpu_identity_0_retval_RetVal"} : (tensor<*xf32>) -> !_tf.control -# CHECK-NEXT: return -# CHECK-NEXT: } From 6cd69820a7ec68363647bf918d312b5d10e0e07a Mon Sep 17 00:00:00 2001 From: Andrew Audibert Date: Tue, 23 Jul 2019 15:40:22 -0700 Subject: [PATCH 0422/3053] Preserve element shape across TensorArray component serde This change addresses the problem raised in #30685, where passing a TensorArray out of a tf.reduce loop would cause it to lose its inferred shape. The issue was that when restoring the TensorArray with _from_components, we would set the _element_shape of the TensorArray wrapper class, but this field is never used. We need to set the _element_shape of the wrapped TensorArray implementation, either _GraphTensorArray, _GraphTensorArrayV2, or _EagerTensorArray. PiperOrigin-RevId: 259626673 --- tensorflow/python/data/util/structure_test.py | 17 +++++++++++++++++ tensorflow/python/ops/tensor_array_ops.py | 2 +- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/data/util/structure_test.py b/tensorflow/python/data/util/structure_test.py index c8fdfed740f..290dc99df27 100644 --- a/tensorflow/python/data/util/structure_test.py +++ b/tensorflow/python/data/util/structure_test.py @@ -373,6 +373,23 @@ class StructureTest(test_base.DatasetTestBase, parameterized.TestCase, self.assertEqual(st_after.dense_shape.shape.as_list(), st.dense_shape.shape.as_list()) + def testPreserveTensorArrayShape(self): + ta = tensor_array_ops.TensorArray( + dtype=dtypes.int32, size=1, element_shape=(3,)) + ta_s = structure.type_spec_from_value(ta) + ta_after = structure.from_tensor_list(ta_s, + structure.to_tensor_list(ta_s, ta)) + self.assertEqual(ta_after.element_shape.as_list(), [3]) + + def testPreserveInferredTensorArrayShape(self): + ta = tensor_array_ops.TensorArray(dtype=dtypes.int32, size=1) + # Shape is inferred from the write. + ta = ta.write(0, [1, 2, 3]) + ta_s = structure.type_spec_from_value(ta) + ta_after = structure.from_tensor_list(ta_s, + structure.to_tensor_list(ta_s, ta)) + self.assertEqual(ta_after.element_shape.as_list(), [3]) + def testIncompatibleStructure(self): # Define three mutually incompatible values/structures, and assert that: # 1.
Using one structure to flatten a value with an incompatible structure diff --git a/tensorflow/python/ops/tensor_array_ops.py b/tensorflow/python/ops/tensor_array_ops.py index 41802aabbb4..fab83c6073f 100644 --- a/tensorflow/python/ops/tensor_array_ops.py +++ b/tensorflow/python/ops/tensor_array_ops.py @@ -1318,7 +1318,7 @@ class TensorArraySpec(type_spec.TypeSpec): flow=tensor_list[0], dynamic_size=self._dynamic_size, infer_shape=self._infer_shape) - ret._element_shape = [self._element_shape] # pylint: disable=protected-access + ret._implementation._element_shape = [self._element_shape] # pylint: disable=protected-access return ret @staticmethod From 01292a6f7c7f7a34f29b60c7d035d0fa432c30ad Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Tue, 23 Jul 2019 15:46:08 -0700 Subject: [PATCH 0423/3053] [XLA] BUILD visibility fix PiperOrigin-RevId: 259627820 --- tensorflow/compiler/xla/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index eeb598b165b..2bafc74c198 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -12,6 +12,7 @@ package( package_group( name = "friends", + includes = ["//tensorflow:internal"], packages = [ "//tensorflow/compiler/...", "//tensorflow/contrib/tpu/...", From 49995f38b6beb602685426b8ad08208520539bcc Mon Sep 17 00:00:00 2001 From: Kaixi Hou Date: Tue, 23 Jul 2019 16:05:40 -0700 Subject: [PATCH 0424/3053] roll back one commit --- tensorflow/core/kernels/cudnn_rnn_ops.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/cudnn_rnn_ops.cc b/tensorflow/core/kernels/cudnn_rnn_ops.cc index 6ca6b47988c..55e8bc134bc 100644 --- a/tensorflow/core/kernels/cudnn_rnn_ops.cc +++ b/tensorflow/core/kernels/cudnn_rnn_ops.cc @@ -1041,7 +1041,7 @@ class CudnnRNNKernelCommon : public OpKernel { num_layers, h_num_units, input_size, /*cell_size=*/c_num_units, /*batch_size=*/0, input_mode, rnn_direction_mode(), rnn_mode(), ToDataType::value, algo_config, dropout(), seed(), - /* state_allocator=*/nullptr, /*use_padded_io=*/true); + /* state_allocator=*/nullptr, /*use_padded_io=*/false); if (!rnn_desc_s.ok()) { return FromExecutorStatus(rnn_desc_s); } From 9dcffc254b3f87a66d2fd9cd3e4711482f4d03a7 Mon Sep 17 00:00:00 2001 From: Tong Shen Date: Tue, 23 Jul 2019 16:21:23 -0700 Subject: [PATCH 0425/3053] Correctly handle lifted _Arg nodes in then/else branch: they might be 2 different sets of _Arg nodes. PiperOrigin-RevId: 259634536 --- .../jit/extract_outside_compilation_pass.cc | 128 ++++++++++++------ 1 file changed, 89 insertions(+), 39 deletions(-) diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc index d9c106044d5..85fb69b620d 100644 --- a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc +++ b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc @@ -525,14 +525,11 @@ xla::StatusOr> UpdateTypesAttribute( void AddEdgesFromOutsideCompilationNodes( const int original_arg_count, const int arg_to_input_edge_offset, const std::vector& data_types, - const std::vector>& - lifted_arg_nodes_and_outside_compilation_nodes, - Graph* g, Node* n) { + const std::vector& outside_compilation_nodes, Graph* g, Node* n) { // Add edges from outside compilation nodes to While node. 
for (int i = original_arg_count; i < data_types.size(); i++) { Node* outside_compilation_node = - lifted_arg_nodes_and_outside_compilation_nodes[i - original_arg_count] - .second; + outside_compilation_nodes[i - original_arg_count]; g->AddEdge(outside_compilation_node, 0, n, i + arg_to_input_edge_offset); } } @@ -574,14 +571,15 @@ Status AddMatchingRetvalNode(const FunctionBody& function_body, void ReplaceLiftedArgNodePlaceholderWithArg( const FunctionBody& function_body, const int original_arg_count, - const int arg_idx, - const std::vector>& - lifted_arg_nodes_and_outside_compilation_nodes, + const int arg_idx, const std::vector& lifted_arg_nodes, Node* arg_node) { - Node* lifted_arg_node = - lifted_arg_nodes_and_outside_compilation_nodes[arg_idx - - original_arg_count] - .first; + Node* lifted_arg_node = lifted_arg_nodes[arg_idx - original_arg_count]; + // This might happen because lifted_arg_node only exists in one branch of an + // If node, and we are handling the other branch. + if (!lifted_arg_node) { + return; + } + for (const Edge* e : lifted_arg_node->out_edges()) { if (e->IsControlEdge()) { function_body.graph->AddControlEdge(arg_node, e->dst()); @@ -589,7 +587,6 @@ void ReplaceLiftedArgNodePlaceholderWithArg( function_body.graph->AddEdge(arg_node, 0, e->dst(), e->dst_input()); } } - function_body.graph->RemoveNode(lifted_arg_node); } @@ -630,13 +627,25 @@ Status PostprocessLiftedArgsForWhile( n)); // Add edges from outside compilation nodes to While node. - AddEdgesFromOutsideCompilationNodes( - original_arg_count, - /*arg_to_input_edge_offset=*/0, data_types, - lifted_arg_nodes_and_outside_compilation_nodes, g, n); + std::vector outside_compilation_nodes; + std::transform( + lifted_arg_nodes_and_outside_compilation_nodes.begin(), + lifted_arg_nodes_and_outside_compilation_nodes.end(), + std::back_inserter(outside_compilation_nodes), + [](const std::pair& pair) { return pair.second; }); + AddEdgesFromOutsideCompilationNodes(original_arg_count, + /*arg_to_input_edge_offset=*/0, + data_types, outside_compilation_nodes, g, + n); // In body_graph, create new _Arg/_Retval nodes, and replace lifted arg // nodes with the new _Arg nodes. + std::vector lifted_arg_nodes; + std::transform( + lifted_arg_nodes_and_outside_compilation_nodes.begin(), + lifted_arg_nodes_and_outside_compilation_nodes.end(), + std::back_inserter(lifted_arg_nodes), + [](const std::pair& pair) { return pair.first; }); for (int i = original_arg_count; i < data_types.size(); i++) { TF_ASSIGN_OR_RETURN(Node * arg_node, AddOutsideCompilationInputArgToFunctionBody( @@ -646,8 +655,7 @@ Status PostprocessLiftedArgsForWhile( AddMatchingRetvalNode(*body_function_body, i, data_types[i], arg_node)); ReplaceLiftedArgNodePlaceholderWithArg( - *body_function_body, original_arg_count, i, - lifted_arg_nodes_and_outside_compilation_nodes, arg_node); + *body_function_body, original_arg_count, i, lifted_arg_nodes, arg_node); } FunctionDef rewritten_body_function_def; @@ -730,20 +738,53 @@ Status PostprocessLiftedArgsForIf( LiftedArgsAndOutsideCompilationNodesInFunctionBody( *else_branch_function_body, outside_compilation_attr_to_node)); + // Merge lifted args from then and else branches. 
+ std::vector outside_compilation_nodes; + std::vector then_branch_lifted_arg_nodes; + for (const auto& pair : + then_branch_lifted_arg_nodes_and_outside_compilation_nodes) { + outside_compilation_nodes.push_back(pair.second); + then_branch_lifted_arg_nodes.push_back(pair.first); + } + for (const auto& pair : + else_branch_lifted_arg_nodes_and_outside_compilation_nodes) { + if (std::find(outside_compilation_nodes.begin(), + outside_compilation_nodes.end(), + pair.second) == outside_compilation_nodes.end()) { + outside_compilation_nodes.push_back(pair.second); + // Then branch does not contain this lifted arg. Add an empty item to + // then_branch_lifted_arg_nodes. + then_branch_lifted_arg_nodes.push_back(nullptr); + } + } + // Reorder else_branch_lifted_arg_nodes_and_outside_compilation_nodes. + std::vector else_branch_lifted_arg_nodes( + outside_compilation_nodes.size()); + for (const auto& pair : + else_branch_lifted_arg_nodes_and_outside_compilation_nodes) { + auto iter = std::find(outside_compilation_nodes.begin(), + outside_compilation_nodes.end(), pair.second); + TF_RET_CHECK(iter != outside_compilation_nodes.end()); + int index = iter - outside_compilation_nodes.begin(); + else_branch_lifted_arg_nodes[index] = pair.first; + } + // Append lifted args' types to If node's Tin attribute. - TF_ASSIGN_OR_RETURN( - std::vector data_types, - UpdateTypesAttribute( - then_branch_lifted_arg_nodes_and_outside_compilation_nodes, "Tin", - n)); + std::vector data_types; + TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "Tin", &data_types)); + for (Node* n : outside_compilation_nodes) { + data_types.push_back(n->output_type(0)); + } + n->ClearAttr("Tin"); + n->AddAttr("Tin", data_types); // Add edges from outside compilation nodes to If node. If node's input #0 // is predicate input, input #1 maps to _Arg #0 of branch functions, thus // arg_to_input_edge_offset is set to 1. 
- AddEdgesFromOutsideCompilationNodes( - original_arg_count, - /*arg_to_input_edge_offset=*/1, data_types, - then_branch_lifted_arg_nodes_and_outside_compilation_nodes, g, n); + AddEdgesFromOutsideCompilationNodes(original_arg_count, + /*arg_to_input_edge_offset=*/1, + data_types, outside_compilation_nodes, g, + n); for (int i = original_arg_count; i < data_types.size(); ++i) { TF_ASSIGN_OR_RETURN(Node * then_branch_arg_node, @@ -752,8 +793,7 @@ Status PostprocessLiftedArgsForIf( ReplaceLiftedArgNodePlaceholderWithArg( *then_branch_function_body, original_arg_count, i, - then_branch_lifted_arg_nodes_and_outside_compilation_nodes, - then_branch_arg_node); + then_branch_lifted_arg_nodes, then_branch_arg_node); TF_ASSIGN_OR_RETURN(Node * else_branch_arg_node, AddOutsideCompilationInputArgToFunctionBody( @@ -761,8 +801,7 @@ Status PostprocessLiftedArgsForIf( ReplaceLiftedArgNodePlaceholderWithArg( *else_branch_function_body, original_arg_count, i, - else_branch_lifted_arg_nodes_and_outside_compilation_nodes, - else_branch_arg_node); + else_branch_lifted_arg_nodes, else_branch_arg_node); } FunctionDef rewritten_then_branch_function_def; @@ -819,14 +858,19 @@ Status PostprocessLiftedArgsForCall( data_types.push_back(data_type); } + std::vector lifted_arg_nodes; + std::transform( + lifted_arg_nodes_and_outside_compilation_nodes.begin(), + lifted_arg_nodes_and_outside_compilation_nodes.end(), + std::back_inserter(lifted_arg_nodes), + [](const std::pair& pair) { return pair.first; }); for (int i = original_arg_count; i < data_types.size(); ++i) { TF_ASSIGN_OR_RETURN( Node * arg_node, AddOutsideCompilationInputArgToFunctionBody(*fbody, i, data_types[i])); - ReplaceLiftedArgNodePlaceholderWithArg( - *fbody, original_arg_count, i, - lifted_arg_nodes_and_outside_compilation_nodes, arg_node); + ReplaceLiftedArgNodePlaceholderWithArg(*fbody, original_arg_count, i, + lifted_arg_nodes, arg_node); } FunctionDef rewritten_fdef; @@ -847,10 +891,16 @@ Status PostprocessLiftedArgsForCall( TF_ASSIGN_OR_RETURN(n, ReplaceNode(g, n, node_def)); // Add edges from outside compilation nodes to call node. - AddEdgesFromOutsideCompilationNodes( - original_arg_count, - /*arg_to_input_edge_offset=*/0, data_types, - lifted_arg_nodes_and_outside_compilation_nodes, g, n); + std::vector outside_compilation_nodes; + std::transform( + lifted_arg_nodes_and_outside_compilation_nodes.begin(), + lifted_arg_nodes_and_outside_compilation_nodes.end(), + std::back_inserter(outside_compilation_nodes), + [](const std::pair& pair) { return pair.second; }); + AddEdgesFromOutsideCompilationNodes(original_arg_count, + /*arg_to_input_edge_offset=*/0, + data_types, outside_compilation_nodes, g, + n); return Status::OK(); } From 21f3ac1cabe8c81f7a3127d5e0af2f9b6655e1c3 Mon Sep 17 00:00:00 2001 From: Thomas O'Malley Date: Tue, 23 Jul 2019 16:24:35 -0700 Subject: [PATCH 0426/3053] Allow Keras Tensors in args and kwargs during Functional API construction for single code path. 
PiperOrigin-RevId: 259635095 --- tensorflow/python/keras/engine/base_layer.py | 104 +----------------- .../python/keras/engine/base_layer_utils.py | 16 --- tensorflow/python/keras/engine/network.py | 69 ++---------- .../python/keras/engine/network_test.py | 86 +++++++++++++++ tensorflow/python/keras/engine/node.py | 14 +++ tensorflow/python/keras/engine/training.py | 28 +++++ .../python/keras/model_subclassing_test.py | 76 ++++--------- 7 files changed, 169 insertions(+), 224 deletions(-) diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index 9757a71c5b0..b193f092374 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -20,7 +20,6 @@ from __future__ import print_function import collections import functools -import inspect # Necessary supplement to tf_inspect to deal with variadic args. import itertools import json import threading @@ -73,7 +72,6 @@ from tensorflow.python.util import deprecation from tensorflow.python.util import nest from tensorflow.python.util import object_identity from tensorflow.python.util import serialization -from tensorflow.python.util import tf_decorator from tensorflow.python.util import tf_inspect from tensorflow.python.util.tf_export import keras_export from tensorflow.tools.docs import doc_controls @@ -197,8 +195,6 @@ class Layer(module.Module): self._metrics = [] self._set_dtype_and_policy(dtype) - self._call_convention = (base_layer_utils - .CallConvention.EXPLICIT_INPUTS_ARGUMENT) # Dependencies tracked via attribute assignment. self._maybe_create_attribute('_layers', []) @@ -1792,27 +1788,6 @@ class Layer(module.Module): return args_dict[arg_name] def _set_connectivity_metadata_(self, inputs, outputs, args, kwargs): - call_convention = getattr( - self, '_call_convention', - base_layer_utils.CallConvention.EXPLICIT_INPUTS_ARGUMENT) - if args: - if call_convention == (base_layer_utils - .CallConvention.EXPLICIT_INPUTS_ARGUMENT): - raise TypeError( - 'This layer ("{}") takes an `inputs` argument in `call()`, ' - 'and only the `inputs` argument may be specified as a positional ' - 'argument. Pass everything else as a keyword argument ' - '(those arguments will not be tracked ' - 'as inputs to the layer).'.format(self.name)) - elif call_convention == (base_layer_utils - .CallConvention.SINGLE_POSITIONAL_ARGUMENT): - raise TypeError( - 'This layer ("{}") takes a single positional argument in `call()`,' - ' which is by convention the `inputs` argument, ' - 'and only this argument may be specified as a positional argument. ' - 'Pass everything else as a keyword argument ' - '(those arguments will not be tracked ' - 'as inputs to the layer).'.format(self.name)) # If the layer returns tensors from its inputs, unmodified, # we copy them to avoid loss of tensor metadata. @@ -1826,85 +1801,16 @@ class Layer(module.Module): output_ls_copy.append(x) outputs = nest.pack_sequence_as(outputs, output_ls_copy) - inputs, kwargs = self._inputs_from_call_args( - call_args=(inputs,) + args, call_kwargs=kwargs) + # Ignore `inputs` arg. + arguments = dict(zip(self._call_fn_args[1:], args)) + arguments.update(kwargs) + # Add an inbound node to the layer, so it can keep track of this call. # This updates the layer history of the output tensor(s). 
self._add_inbound_node( - input_tensors=inputs, output_tensors=outputs, arguments=kwargs) + input_tensors=inputs, output_tensors=outputs, arguments=arguments) return inputs, outputs - def _inputs_from_call_args(self, call_args, call_kwargs): - """Get Layer inputs from __call__ *args and **kwargs. - - Args: - call_args: The positional arguments passed to __call__. - call_kwargs: The keyword argument dict passed to __call__. - - Returns: - A tuple of (inputs, non_input_kwargs). These may be the same objects as - were passed in (call_args and call_kwargs). - """ - call_convention = getattr( - self, '_call_convention', - base_layer_utils.CallConvention.EXPLICIT_INPUTS_ARGUMENT) - if (call_convention in ( - base_layer_utils.CallConvention.EXPLICIT_INPUTS_ARGUMENT, - base_layer_utils.CallConvention.SINGLE_POSITIONAL_ARGUMENT)): - assert len(call_args) == 1 # TypeError raised earlier in __call__. - return call_args[0], call_kwargs - else: - call_arg_spec = tf_inspect.getfullargspec(self.call) - # There is no explicit "inputs" argument expected or provided to - # call(). Arguments which have default values are considered non-inputs, - # and arguments without are considered inputs. - if call_arg_spec.defaults: - if call_arg_spec.varargs is not None: - raise TypeError( - 'Layers may not accept both positional arguments and ' - 'arguments with default values (unable to determine which ' - 'are inputs to the layer). ' - 'Issue occurred with layer "%s"' % (self.name)) - keyword_arg_names = set( - call_arg_spec.args[-len(call_arg_spec.defaults):]) - else: - keyword_arg_names = set() - # Training is never an input argument name, to allow signatures like - # call(x, training). - keyword_arg_names.add('training') - _, unwrapped_call = tf_decorator.unwrap(self.call) - bound_args = inspect.getcallargs( - unwrapped_call, *call_args, **call_kwargs) - if call_arg_spec.varkw is not None: - var_kwargs = bound_args.pop(call_arg_spec.varkw) - bound_args.update(var_kwargs) - keyword_arg_names = keyword_arg_names.union(var_kwargs.keys()) - all_args = call_arg_spec.args - if all_args and bound_args[all_args[0]] is self: - # Ignore the 'self' argument of methods - bound_args.pop(call_arg_spec.args[0]) - all_args = all_args[1:] - non_input_arg_values = {} - input_arg_values = [] - remaining_args_are_keyword = False - for argument_name in all_args: - if argument_name in keyword_arg_names: - remaining_args_are_keyword = True - else: - if remaining_args_are_keyword: - raise TypeError( - 'Found a positional argument in a layer call after a non-input ' - 'argument. All arguments after "training" must be keyword ' - 'arguments, and are not tracked as inputs to the layer. 
' - 'Issue occurred with layer "%s"' % (self.name)) - if remaining_args_are_keyword: - non_input_arg_values[argument_name] = bound_args[argument_name] - else: - input_arg_values.append(bound_args[argument_name]) - if call_arg_spec.varargs is not None: - input_arg_values.extend(bound_args[call_arg_spec.varargs]) - return input_arg_values, non_input_arg_values - def _add_inbound_node(self, input_tensors, output_tensors, diff --git a/tensorflow/python/keras/engine/base_layer_utils.py b/tensorflow/python/keras/engine/base_layer_utils.py index 14e2cabf39b..ad0c7cc4d02 100644 --- a/tensorflow/python/keras/engine/base_layer_utils.py +++ b/tensorflow/python/keras/engine/base_layer_utils.py @@ -19,8 +19,6 @@ from __future__ import print_function import threading -import enum - from tensorflow.python.distribute import distribution_strategy_context from tensorflow.python.eager import context from tensorflow.python.framework import dtypes @@ -39,20 +37,6 @@ from tensorflow.python.util import tf_contextlib _call_context = threading.local() -class CallConvention(enum.Enum): - """Calling conventions for passing `Layer` inputs to `Layer.call`.""" - # The Layer takes inputs as its first argument, named "inputs" for - # compatibility with the signature of Layer.__call__. This is the mode assumed - # for Layers which are not subclassed Models. - EXPLICIT_INPUTS_ARGUMENT = 1 - # The Layer takes a single positional argument, not named "inputs". It's - # treated like an "inputs" argument. - SINGLE_POSITIONAL_ARGUMENT = 2 - # The Layer has multiple positional arguments to which its inputs should be - # bound. - POSITIONAL_ARGUMENTS_ARE_INPUTS = 3 - - def create_mean_metric(value, name=None): # TODO(psv): Remove this import when b/110718070 is fixed. from tensorflow.python.keras import metrics as metrics_module # pylint: disable=g-import-not-at-top diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py index 9bb23bc90d5..9569bf79a91 100644 --- a/tensorflow/python/keras/engine/network.py +++ b/tensorflow/python/keras/engine/network.py @@ -253,8 +253,6 @@ class Network(base_layer.Layer): kwargs, {'trainable'}, 'Functional models may only specify `name` and `trainable` keyword ' 'arguments during initialization. Got an unexpected argument:') - self._call_convention = (base_layer_utils - .CallConvention.EXPLICIT_INPUTS_ARGUMENT) # Normalize and set self.inputs, self.outputs. if isinstance(inputs, list) and len(nest.flatten(inputs)) == 1: inputs = inputs[0] @@ -378,8 +376,6 @@ class Network(base_layer.Layer): self._call_accepts_kwargs) self._expects_mask_arg = ('mask' in self._call_fn_args or self._call_accepts_kwargs) - call_argspec = tf_inspect.getfullargspec(self.call) - self._call_convention = self._determine_call_convention(call_argspec) self.outputs = [] self.inputs = [] self.built = False @@ -390,45 +386,6 @@ class Network(base_layer.Layer): return any(layer.dynamic for layer in self.layers) return self._dynamic or any(layer.dynamic for layer in self.layers) - def _determine_call_convention(self, call_argspec): - """Decides how `self.call()` is invoked. See `CallConvention`.""" - if call_argspec.varargs: - may_take_single_argument = False - else: - try: - # Note: tf_inspect doesn't raise a TypeError when regular inspect would, - # so we need to keep in mind that "getcallargs" may have returned - # something even though we under-specified positional arguments. 
- all_args = tf_inspect.getcallargs(self.call, None) - self_args = set() - for arg_name, obj in all_args.items(): - if obj is self: - self_args.add(arg_name) - may_take_single_argument = True - except TypeError: - may_take_single_argument = False - if may_take_single_argument: - # A single positional argument (plus "self") is considered equivalent to - # an "inputs" argument. - all_positional_args = len(call_argspec.args) - if call_argspec.defaults is not None: - all_positional_args -= len(call_argspec.defaults) - non_self_positional_args = all_positional_args - for positional_arg_name in call_argspec.args[:all_positional_args]: - if positional_arg_name in self_args: - non_self_positional_args -= 1 - if non_self_positional_args == 1: - if 'inputs' in call_argspec.args[all_positional_args:]: - raise TypeError( - "Model.call() takes a single positional argument (to which " - "inputs are passed by convention) and a separate 'inputs' " - "argument. Unable to determine which arguments are inputs.") - return base_layer_utils.CallConvention.SINGLE_POSITIONAL_ARGUMENT - if 'inputs' in call_argspec.args: - return base_layer_utils.CallConvention.EXPLICIT_INPUTS_ARGUMENT - else: - return base_layer_utils.CallConvention.POSITIONAL_ARGUMENTS_ARE_INPUTS - def _track_layers(self, layers): """Add Trackable dependencies on a list of Layers.""" weight_layer_index = 0 @@ -863,21 +820,20 @@ class Network(base_layer.Layer): computed_tensors = nest.map_structure( lambda t: tensor_dict[str(id(t))], node.input_tensors) - # Ensure `training` and `mask` arg propagation if applicable. + # Ensure `training` arg propagation if applicable. kwargs = copy.copy(node.arguments) if node.arguments else {} argspec = self._layer_call_argspecs[layer].args if 'training' in argspec: kwargs.setdefault('training', training) - if 'mask' in kwargs: - def _map_mask_if_from_keras_layer(m): - # Replace input mask that originates from a Keras layer with - # its computed value. - m_id = str(id(m)) - return tensor_dict[m_id] if m_id in tensor_dict else m + # Map Keras tensors in kwargs to their computed value. + def _map_tensor_if_from_keras_layer(t): + if isinstance(t, ops.Tensor) and hasattr(t, '_keras_history'): + t_id = str(id(t)) + return tensor_dict[t_id] + return t - kwargs['mask'] = nest.map_structure(_map_mask_if_from_keras_layer, - kwargs['mask']) + kwargs = nest.map_structure(_map_tensor_if_from_keras_layer, kwargs) # Compute outputs. output_tensors = layer(computed_tensors, **kwargs) @@ -1789,11 +1745,10 @@ def _map_graph_network(inputs, outputs): # Update the depth of inbound nodes. # The "depth" of a node is the max of the depths - # of all layers it is connected to. - for inbound_layer, node_index, _, _ in node.iterate_inbound(): - inbound_node = inbound_layer._inbound_nodes[node_index] # pylint: disable=protected-access - previous_depth = nodes_depths.get(inbound_node, 0) - nodes_depths[inbound_node] = max(depth + 1, previous_depth) + # of all nodes it is connected to + 1. + for node_dep in node._get_all_node_dependencies(): + previous_depth = nodes_depths.get(node_dep, 0) + nodes_depths[node_dep] = max(depth + 1, previous_depth) # Handle inputs that are not connected to outputs. 
# We do not error out here because the inputs may be used to compute losses diff --git a/tensorflow/python/keras/engine/network_test.py b/tensorflow/python/keras/engine/network_test.py index 06454479d80..53a2df6b268 100644 --- a/tensorflow/python/keras/engine/network_test.py +++ b/tensorflow/python/keras/engine/network_test.py @@ -903,6 +903,92 @@ class NetworkConstructionTest(keras_parameterized.TestCase): # Data is not masked, returned values are random. self.assertGreater(history.history['loss'][0], 0.0) + @keras_parameterized.run_all_keras_modes + def test_call_arg_derived_from_keras_layer(self): + + class MyAdd(keras.layers.Layer): + + def call(self, x1, x2): + return x1 + x2 + + input1 = keras.Input(10) + input2 = keras.Input(10) + outputs = MyAdd()(input1, input2) + model = keras.Model([input1, input2], outputs) + model.compile( + 'sgd', + 'mse', + run_eagerly=testing_utils.should_run_eagerly(), + run_distributed=testing_utils.should_run_distributed()) + history = model.fit( + x=[3 * np.ones((10, 10)), 7 * np.ones((10, 10))], + y=10 * np.ones((10, 10)), + batch_size=2) + # Check that second input was correctly added to first. + self.assertEqual(history.history['loss'][0], 0.0) + + @keras_parameterized.run_all_keras_modes + def test_call_kwarg_derived_from_keras_layer(self): + + class MaybeAdd(keras.layers.Layer): + + def call(self, x1, x2=None): + if x2 is not None: + return x1 + x2 + return x1 + + input1 = keras.Input(10) + input2 = keras.Input(10) + outputs = MaybeAdd()(input1, x2=input2) + model = keras.Model([input1, input2], outputs) + model.compile( + 'sgd', + 'mse', + run_eagerly=testing_utils.should_run_eagerly(), + run_distributed=testing_utils.should_run_distributed()) + history = model.fit( + x=[3 * np.ones((10, 10)), 7 * np.ones((10, 10))], + y=10 * np.ones((10, 10)), + batch_size=2) + # Check that second input was correctly added to first. + self.assertEqual(history.history['loss'][0], 0.0) + + @keras_parameterized.run_all_keras_modes + def test_call_nested_arg_derived_from_keras_layer(self): + + class AddAll(keras.layers.Layer): + + def call(self, x1, x2, x3=None): + out = x1 + x2 + if x3 is not None: + for t in x3.values(): + out += t + return out + + input1 = keras.Input(10) + input2 = keras.Input(10) + input3 = keras.Input(10) + outputs = AddAll()( + input1, + 4 * array_ops.ones((1, 10)), + x3={ + 'a': input2, + 'b': input3, + 'c': 5 * array_ops.ones((1, 10)) + }) + model = keras.Model([input1, input2, input3], outputs) + model.compile( + 'sgd', + 'mse', + run_eagerly=testing_utils.should_run_eagerly(), + run_distributed=testing_utils.should_run_distributed()) + history = model.fit( + x=[np.ones((10, 10)), 2 * np.ones((10, 10)), 3 * np.ones((10, 10))], + y=15 * np.ones((10, 10)), + batch_size=2) + # Check that all inputs were correctly added. 
+ self.assertEqual(history.history['loss'][0], 0.0) + @keras_parameterized.run_all_keras_modes def test_multi_output_model_with_none_masking(self): def func(x): diff --git a/tensorflow/python/keras/engine/node.py b/tensorflow/python/keras/engine/node.py index f169fdb14fd..9a7ecb79c47 100644 --- a/tensorflow/python/keras/engine/node.py +++ b/tensorflow/python/keras/engine/node.py @@ -18,6 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.python.framework import ops from tensorflow.python.keras import backend from tensorflow.python.util import nest @@ -130,6 +131,19 @@ class Node(object): nest.flatten(self.inbound_layers), nest.flatten(self.node_indices), nest.flatten(self.tensor_indices), nest.flatten(self.input_tensors)) + def _get_all_node_dependencies(self): + """Returns all of the nodes this node immediately depends on.""" + node_deps = [] + for layer, node_index, _, _ in self.iterate_inbound(): + node_deps.append(layer._inbound_nodes[node_index]) + + for arg in nest.flatten(self.arguments): + if isinstance(arg, ops.Tensor) and hasattr(arg, '_keras_history'): + kh = arg._keras_history + node_deps.append(kh.layer._inbound_nodes[kh.node_index]) + + return node_deps + def get_config(self): inbound_names = nest.map_structure( lambda layer: layer.name if layer else None, self.inbound_layers) diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index eb10f20fb0d..ee898f825c9 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -686,6 +686,7 @@ class Model(network.Network): if kwargs: raise TypeError('Unrecognized keyword arguments: ' + str(kwargs)) self._assert_compile_was_called() + self._check_call_args('fit') func = self._select_training_loop(x) return func.fit( @@ -798,6 +799,7 @@ class Model(network.Network): """ _keras_api_gauge.get_cell('evaluate').set(True) self._assert_compile_was_called() + self._check_call_args('evaluate') func = self._select_training_loop(x) return func.evaluate( @@ -875,6 +877,7 @@ class Model(network.Network): that is not a multiple of the batch size. """ _keras_api_gauge.get_cell('predict').set(True) + self._check_call_args('predict') func = self._select_training_loop(x) return func.predict( @@ -956,6 +959,7 @@ class Model(network.Network): return outputs self._assert_compile_was_called() + self._check_call_args('train_on_batch') # If at this point we are in the replica context, then it is okay to execute # the Eager code path. The expected way to get here is to call `fit` that # calls `train_on_batch` on each replica. @@ -1048,6 +1052,7 @@ class Model(network.Network): return outputs self._assert_compile_was_called() + self._check_call_args('test_on_batch') if (self._distribution_strategy and distribution_strategy_context.in_cross_replica_context()): raise NotImplementedError('`test_on_batch` is not supported for models ' @@ -1100,6 +1105,7 @@ class Model(network.Network): ValueError: In case of mismatch between given number of inputs and expectations of the model. 
""" + self._check_call_args('predict_on_batch') if self._run_distributed: return training_v2_utils.predict_on_batch(self, x) @@ -1246,6 +1252,7 @@ class Model(network.Network): raise NotImplementedError('`fit_generator` is not supported for ' 'models compiled with tf.distribute.Strategy.') _keras_api_gauge.get_cell('train').set(True) + self._check_call_args('fit_generator') return training_generator.fit_generator( self, generator, @@ -1319,6 +1326,7 @@ class Model(network.Network): raise NotImplementedError('`evaluate_generator` is not supported for ' 'models compiled with tf.distribute.Strategy.') _keras_api_gauge.get_cell('evaluate').set(True) + self._check_call_args('evaluate_generator') return training_generator.evaluate_generator( self, generator, @@ -1376,6 +1384,7 @@ class Model(network.Network): raise NotImplementedError('`predict_generator` is not supported for ' 'models compiled with tf.distribute.Strategy.') _keras_api_gauge.get_cell('predict').set(True) + self._check_call_args('predict_generator') return training_generator.predict_generator( self, generator, @@ -1386,6 +1395,25 @@ class Model(network.Network): verbose=verbose, callbacks=callbacks) + def _check_call_args(self, method_name): + """Check that `call` has only one positional arg.""" + # Always allow first arg, regardless of arg name. + fullargspec = tf_inspect.getfullargspec(self.call) + if fullargspec.defaults: + positional_args = fullargspec.args[:-len(fullargspec.defaults)] + else: + positional_args = fullargspec.args + if 'training' in positional_args: + positional_args.remove('training') + + # self and first arg can be positional. + if len(positional_args) > 2: + extra_args = positional_args[2:] + raise ValueError( + 'Models passed to `' + method_name + '` can only have `training` ' + 'and the first argument in `call` as positional arguments, ' + 'found: ' + str(extra_args) + '.') + def _prepare_validation_data(self, validation_data, batch_size, validation_steps): """Unpack and check the validation data.""" diff --git a/tensorflow/python/keras/model_subclassing_test.py b/tensorflow/python/keras/model_subclassing_test.py index 39d6594a318..9cf1932fd4f 100644 --- a/tensorflow/python/keras/model_subclassing_test.py +++ b/tensorflow/python/keras/model_subclassing_test.py @@ -1254,22 +1254,6 @@ class CustomCallSignatureTests(test.TestCase): ValueError, 'cannot build your model if it has positional'): model.build(input_shape=[first_input_shape, second_input_shape]) - def test_inputs_in_signature(self): - - class HasInputsAndOtherPositional(keras.Model): - - def call(self, inputs, some_other_arg, training=False): - return inputs - - def compute_output_shape(self, input_shape): - return input_shape - - model = HasInputsAndOtherPositional() - with self.assertRaisesRegexp( - TypeError, 'everything else as a keyword argument'): - x1, x2 = keras.Input((1, 1)), keras.Input((1, 1)) - model(x1, x2) - def test_kwargs_in_signature(self): class HasKwargs(keras.Model): @@ -1283,34 +1267,6 @@ class CustomCallSignatureTests(test.TestCase): if not context.executing_eagerly(): self.assertEqual(len(model.inputs), 1) - def test_args_in_signature(self): - - class HasArgs(keras.Model): - - def call(self, x, *args, **kwargs): - return [x] + list(args) - - def compute_output_shape(self, input_shape): - return input_shape - - model = HasArgs() - x1, x2, x3 = keras.Input((1, 1)), keras.Input((1, 1)), keras.Input((1, 1)) - model(x1, x2, x3, a=3) - self.assertEqual(len(model.inputs), 3) - - def test_args_and_keywords_in_signature(self): - - 
class HasArgs(keras.Model): - - def call(self, x, training=True, *args, **kwargs): # pylint:disable=keyword-arg-before-vararg - return x - - model = HasArgs() - x1, x2, x3 = keras.Input((1, 1)), keras.Input((1, 1)), keras.Input((1, 1)) - with self.assertRaisesRegexp( - TypeError, 'may not accept both positional arguments and '): - model(x1, x2, x3, a=3) - @test_util.assert_no_new_tensors @test_util.assert_no_garbage_created def test_training_no_default(self): @@ -1323,17 +1279,33 @@ class CustomCallSignatureTests(test.TestCase): model(arg, True) self.assertEqual(len(model.inputs), 1) - def test_training_no_default_with_positional(self): + def test_positional_arg_in_call(self): - class TrainingNoDefaultWithPositional(keras.Model): + class ModelWithPositionalArgs(keras.Model): - def call(self, x, training, positional): - return x + def call(self, x, x2, x3=None): + return x + x2 + + x = np.ones((10, 1)) + y = np.ones((10, 1)) + m = ModelWithPositionalArgs() + m.compile('sgd', 'mse') + with self.assertRaisesRegexp(ValueError, r'Models passed to `fit`'): + m.fit(x, y, batch_size=2) + with self.assertRaisesRegexp(ValueError, r'Models passed to `evaluate`'): + m.evaluate(x, y, batch_size=2) + with self.assertRaisesRegexp(ValueError, r'Models passed to `predict`'): + m.predict(x, batch_size=2) + with self.assertRaisesRegexp(ValueError, + r'Models passed to `train_on_batch`'): + m.train_on_batch(x, y) + with self.assertRaisesRegexp(ValueError, + r'Models passed to `test_on_batch`'): + m.test_on_batch(x, y) + with self.assertRaisesRegexp(ValueError, + r'Models passed to `predict_on_batch`'): + m.predict_on_batch(x) - model = TrainingNoDefaultWithPositional() - x1, x2, x3 = keras.Input((1, 1)), keras.Input((1, 1)), keras.Input((1, 1)) - with self.assertRaisesRegexp(TypeError, 'after a non-input'): - model(x1, x2, x3) if __name__ == '__main__': test.main() From 22caf9b8cb5b5070216d6ec187a67929de9ff4f6 Mon Sep 17 00:00:00 2001 From: Ashwin Murthy Date: Tue, 23 Jul 2019 16:33:17 -0700 Subject: [PATCH 0427/3053] [TFLite] Add a new OpTrait to model stateful tensor operands for LSTM/RNN ops. These will be used in the export to flatbuffer to set additional tensor state (is_variable) for such operands. 
PiperOrigin-RevId: 259636603 --- tensorflow/compiler/mlir/lite/ir/tfl_ops.cc | 25 +++++++++++++++++++ tensorflow/compiler/mlir/lite/ir/tfl_ops.td | 20 ++++++++++++--- tensorflow/compiler/mlir/lite/ir/tfl_traits.h | 19 ++++++++++++++ 3 files changed, 60 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc index b79545353f6..23d1388ed72 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc @@ -568,6 +568,31 @@ static LogicalResult Verify(UnpackOp op) { // TODO(b/133854225): Implement shape inference to Mean +//===----------------------------------------------------------------------===// +// LSTMOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(LSTMOp op) { + auto operands = op.GetStatefulOperands(); + if (operands.size() == 2 && operands[0] == 18 && operands[1] == 19) { + return success(); + } + return op.emitError("LSTMOp expected to have two stateful operands"); +} + +//===----------------------------------------------------------------------===// +// UnidirectionalSequenceLSTMOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(UnidirectionalSequenceLSTMOp op) { + auto operands = op.GetStatefulOperands(); + if (operands.size() == 2 && operands[0] == 18 && operands[1] == 19) { + return success(); + } + return op.emitError( + "UnidirectionalSequenceLSTMOp expected to have two stateful operands"); +} + //===----------------------------------------------------------------------===// // TableGen'd op method definitions //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index 8c78f7a9dc8..21f5ce1bf5b 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -163,8 +163,6 @@ def TFL_IntTensor : TypeAlias; // This is used to represent the type of "ref tensors" or tensors that are // used as variables to track state. -// TODO(ashwinm): This is a placeholder until we have first class support -// for variables. def TFL_StatefulTensor : TypeAlias; // Tensor or None type. @@ -284,6 +282,14 @@ class TFL_AccumulatorUniformScale : NativeOpTrait< // apply quantization on this op. def TFL_NoQuantizableResult : NativeOpTrait<"TFL::NoQuantizableResult">; + +//===----------------------------------------------------------------------===// +// TFL native op trait for stateful operands. + +class StatefulOperands operands> + : ParamNativeOpTrait<"TFL::StatefulOperands", StrJoinInt.result>; + + //===----------------------------------------------------------------------===// // TFL op base class. //===----------------------------------------------------------------------===// @@ -2327,7 +2333,8 @@ def TFL_LSTMOp : [LstmMandatoryInputsConstraint, LstmOptionalPeepholeWeightConstraint, LstmProjectionWeightBiasConstraint, - LstmResultConstraint]> { + LstmResultConstraint, + StatefulOperands<[18, 19]>]> { let summary = "The full lstm operator"; let description = [{ @@ -2405,6 +2412,8 @@ Ba et al. “Layer Normalization” let results = (outs AnyTensor:$output); let hasOptions = 1; + + let verifier = [{ return Verify(*this); }]; } // UnidirectionalSequenceLstm op . 
@@ -2415,7 +2424,8 @@ def TFL_UnidirectionalSequenceLSTMOp : [LstmMandatoryInputsConstraint, LstmOptionalPeepholeWeightConstraint, LstmProjectionWeightBiasConstraint, - LstmResultConstraint]> { + LstmResultConstraint, + StatefulOperands<[18, 19]>]> { let summary = "Unidirectional sequence lstm operator"; let description = [{ @@ -2482,6 +2492,8 @@ def TFL_UnidirectionalSequenceLSTMOp : let results = (outs AnyTensor:$output); let hasOptions = 1; + + let verifier = [{ return Verify(*this); }]; } #endif // TFL_OPS diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_traits.h b/tensorflow/compiler/mlir/lite/ir/tfl_traits.h index 807c1100b71..97fc87a79f3 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_traits.h +++ b/tensorflow/compiler/mlir/lite/ir/tfl_traits.h @@ -120,6 +120,25 @@ class NoQuantizableResult static bool IsQuantizable() { return false; } }; +// The trait to specify that the specified operands of the TFL op are stateful. +// This is used as a trait like this: +// +// class LSTMOp +// : public Op::Impl> { +// +template +class StatefulOperands { + public: + template + class Impl + : public TraitBase::Impl> { + public: + static std::vector GetStatefulOperands() { + return std::vector({Operands...}); + } + }; +}; + } // namespace TFL } // namespace OpTrait } // namespace mlir From b89808261726d50241dbedd16ac99367403650ef Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 23 Jul 2019 16:48:15 -0700 Subject: [PATCH 0428/3053] Adds an experimental C API to expose TF shape inference functions for ops. PiperOrigin-RevId: 259639450 --- tensorflow/c/c_api_experimental.cc | 141 ++++++++++++++++++++++++ tensorflow/c/c_api_experimental.h | 48 ++++++++ tensorflow/c/c_api_experimental_test.cc | 87 +++++++++++++++ 3 files changed, 276 insertions(+) diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc index ad0c4068d45..b37d2e799de 100644 --- a/tensorflow/c/c_api_experimental.cc +++ b/tensorflow/c/c_api_experimental.cc @@ -24,6 +24,8 @@ limitations under the License. #include "tensorflow/compiler/jit/flags.h" #include "tensorflow/core/common_runtime/eager/attr_builder.h" #include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/shape_inference.h" #include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/node_builder.h" @@ -995,3 +997,142 @@ TFE_TensorHandle* TFE_ConsumeInputConcreteTensorFromTraceContext( << handle->DebugString(); return ret; } + +TF_ShapeAndTypeList* TF_NewShapeAndTypeList(int num_items) { + TF_ShapeAndTypeList* result = new TF_ShapeAndTypeList; + result->num_items = num_items; + result->items = (num_items == 0) ? 
nullptr : new TF_ShapeAndType[num_items](); + return result; +} + +void TF_ShapeAndTypeListSetShape(TF_ShapeAndTypeList* shape_list, int index, + const int64_t* dims, int num_dims) { + DCHECK(index >= 0 && index < shape_list->num_items); + TF_ShapeAndType& shape = shape_list->items[index]; + DCHECK(shape.dims == nullptr) << "Shape at " << index << " is already set!"; + DCHECK(num_dims >= 0) << "Number of dimensions cannot be negative!"; + shape.num_dims = num_dims; + shape.dims = new int64_t[num_dims]; + memcpy(shape.dims, dims, sizeof(int64_t) * num_dims); +} + +void TF_ShapeAndTypeListSetUnknownShape(TF_ShapeAndTypeList* shape_list, + int index) { + DCHECK(index >= 0 && index < shape_list->num_items); + TF_ShapeAndType& shape = shape_list->items[index]; + DCHECK(shape.dims == nullptr) << "Shape at " << index << " is already set!"; + shape.num_dims = -1; + shape.dims = nullptr; +} + +void TF_ShapeAndTypeListSetDtype(TF_ShapeAndTypeList* shape_list, int index, + TF_DataType dtype) { + DCHECK(index >= 0 && index < shape_list->num_items); + TF_ShapeAndType& shape_and_type = shape_list->items[index]; + shape_and_type.dtype = dtype; +} + +void TF_DeleteShapeAndTypeList(TF_ShapeAndTypeList* shape_list) { + if (shape_list == nullptr) return; + for (size_t i = 0; i < shape_list->num_items; ++i) { + delete[] shape_list->items[i].dims; + } + delete[] shape_list->items; + delete shape_list; +} + +void TF_DeleteShapeAndTypeListArray(TF_ShapeAndTypeList** shape_list_array, + int num_items) { + if (shape_list_array == nullptr) return; + for (int i = 0; i < num_items; ++i) { + TF_DeleteShapeAndTypeList(shape_list_array[i]); + } + delete[] shape_list_array; +} + +void TFE_InferShapes(TFE_Op* tfe_op, TF_ShapeAndTypeList* input_shapes, + TF_Tensor** input_tensors, int num_input_tensors, + TF_ShapeAndTypeList* input_tensors_as_shapes, + TF_ShapeAndTypeList** input_resource_shapes_and_types, + TF_ShapeAndTypeList** output_shapes, + TF_ShapeAndTypeList*** output_resource_shapes_and_types, + TF_Status* status) { + using tensorflow::NodeDef; + using tensorflow::OpRegistrationData; + using tensorflow::Tensor; + using tensorflow::shape_inference::DimensionHandle; + using tensorflow::shape_inference::InferenceContext; + using tensorflow::shape_inference::ShapeAndType; + using tensorflow::shape_inference::ShapeHandle; + + const int num_inputs = input_shapes->num_items; + NodeDef node_def; + node_def.set_name(tfe_op->operation.Name()); + node_def.set_op(tfe_op->operation.Name()); + for (int i = 0; i < num_inputs; ++i) { + node_def.add_input("dummy_input"); + } + tfe_op->operation.Attrs().FillAttrValueMap(node_def.mutable_attr()); + + const tensorflow::OpRegistrationData* op_reg_data; + status->status = + tensorflow::OpRegistry::Global()->LookUp(node_def.op(), &op_reg_data); + if (!status->status.ok()) return; + + // Create an inference context with dummy values, which will be updated later. + InferenceContext c(TF_GRAPH_DEF_VERSION, &node_def, op_reg_data->op_def, + std::vector(num_inputs), + std::vector(num_inputs, nullptr), {}, + std::vector>>()); + + // Set input_shapes. + for (int i = 0; i < num_inputs; ++i) { + std::vector dims; + const TF_ShapeAndType& input_shape = input_shapes->items[i]; + if (input_shape.num_dims == InferenceContext::kUnknownRank) { + c.SetInput(i, c.UnknownShape()); + continue; + } + for (int j = 0; j < input_shape.num_dims; ++j) { + dims.push_back(c.MakeDim(input_shape.dims[j])); + } + c.SetInput(i, c.MakeShape(dims)); + } + + // TODO(bgogul): Handle input_tensors. 
+ // TODO(bgogul): Handle input_tensors_as_shapes. + // TODO(bgogul): Handle input_resource_shapes_and_types. + + status->status = c.construction_status(); + if (!status->status.ok()) return; + + if (op_reg_data->shape_inference_fn == nullptr) { + status->status = + InvalidArgument("No shape inference function exists for op '", + node_def.op(), "', did you forget to define it?"); + return; + } + + status->status = c.Run(op_reg_data->shape_inference_fn); + if (!status->status.ok()) return; + + // Set output_shapes. + TF_ShapeAndTypeList* output_shapes_result = + TF_NewShapeAndTypeList(c.num_outputs()); + for (int i = 0; i < c.num_outputs(); ++i) { + ShapeHandle shape_handle = c.output(i); + TF_ShapeAndType& shape = output_shapes_result->items[i]; + shape.num_dims = c.Rank(shape_handle); + if (shape.num_dims == InferenceContext::kUnknownRank) { + shape.dims = nullptr; + continue; + } + shape.dims = new int64_t[shape.num_dims]; + for (size_t j = 0; j < shape.num_dims; ++j) { + shape.dims[j] = c.Value(c.Dim(shape_handle, j)); + } + } + if (output_shapes != nullptr) *output_shapes = output_shapes_result; + + // TODO(bgogul): Set output_resource_shapes_and_types. +} diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h index d91f3ab8b05..36028fd04ce 100644 --- a/tensorflow/c/c_api_experimental.h +++ b/tensorflow/c/c_api_experimental.h @@ -343,6 +343,54 @@ TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_ConsumeInputConcreteTensorFromTraceContext(TFE_TraceContext* trace_ctx, unsigned int idx); +// Information about the shape of a Tensor and its type. +struct TF_ShapeAndType { + // Number of dimensions. -1 indicates unknown rank. + int num_dims; + // Array of dimensions. -1 indicates unknown dim. + int64_t* dims; + // The data type. May be 0 to denote unknown type. + TF_DataType dtype; +}; + +typedef struct TF_ShapeAndType TF_ShapeAndType; + +// A list of TF_ShapeAndType elements.. +struct TF_ShapeAndTypeList { + int num_items; + TF_ShapeAndType* items; +}; +typedef struct TF_ShapeAndTypeList TF_ShapeAndTypeList; + +// API for manipulating TF_ShapeAndTypeList objects. +// +TF_CAPI_EXPORT extern TF_ShapeAndTypeList* TF_NewShapeAndTypeList( + int num_shapes); +TF_CAPI_EXPORT extern void TF_ShapeAndTypeListSetShape( + TF_ShapeAndTypeList* shape_list, int index, const int64_t* dims, + int num_dims); +TF_CAPI_EXPORT extern void TF_ShapeAndTypeListSetUnknownShape( + TF_ShapeAndTypeList* shape_list, int index); +TF_CAPI_EXPORT extern void TF_ShapeAndTypeListSetDtype( + TF_ShapeAndTypeList* shape_list, int index, TF_DataType dtype); +TF_CAPI_EXPORT extern void TF_DeleteShapeAndTypeList( + TF_ShapeAndTypeList* shape_list); +TF_CAPI_EXPORT extern void TF_DeleteShapeAndTypeListArray( + TF_ShapeAndTypeList** shape_list_array, int num_items); + +// Infer shapes for the given `node_def`. The arguments mimic the arguments of +// the `shape_inference::InferenceContext` constructor. The types need not be +// set in `input_shapes` as it is not used for shape inference. +// +// The results are returned in `output_shapes` and +// `output_resource_shapes_and_types`. The caller is responsible for freeing the +// memory in these buffers by calling `TF_DeleteShapeAndTypeList`. 
+TF_CAPI_EXPORT extern void TFE_InferShapes( + TFE_Op* op, TF_ShapeAndTypeList* input_shapes, TF_Tensor** input_tensors, + int num_input_tensors, TF_ShapeAndTypeList* input_tensor_as_shapes, + TF_ShapeAndTypeList** input_resource_shapes_and_types, + TF_ShapeAndTypeList** output_shapes, + TF_ShapeAndTypeList*** output_resource_shapes_and_types, TF_Status* status); #ifdef __cplusplus } /* end extern "C" */ #endif diff --git a/tensorflow/c/c_api_experimental_test.cc b/tensorflow/c/c_api_experimental_test.cc index 55f3a8599fd..f4f6753e8b7 100644 --- a/tensorflow/c/c_api_experimental_test.cc +++ b/tensorflow/c/c_api_experimental_test.cc @@ -431,5 +431,92 @@ TEST_F(AddEagerOpToGraphTest, TFE_DeleteTensorHandle(matrix); } +class ShapeInferenceTest : public ::testing::Test { + protected: + ShapeInferenceTest() + : status_(TF_NewStatus()), tfe_context_options_(TFE_NewContextOptions()) { + tfe_context_ = TFE_NewContext(tfe_context_options_, status_); + CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_); + matmul_op_ = TFE_NewOp(tfe_context_, "MatMul", status_); + CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_); + } + + ~ShapeInferenceTest() override { + TFE_DeleteOp(matmul_op_); + TFE_DeleteContextOptions(tfe_context_options_); + TFE_DeleteContext(tfe_context_); + TF_DeleteStatus(status_); + } + + void infer_matmul_shapes(TF_ShapeAndTypeList* input_shapes, + int64_t expected_rank, int64_t expected_first_dim, + int64_t expected_second_dim) { + TF_ShapeAndTypeList* output_shapes; + TFE_InferShapes(matmul_op_, input_shapes, + /*input_tensors*/ nullptr, /*num_input_tensors*/ 0, + /*input_tensors_as_shapes*/ nullptr, + /*input_resource_shapes_and_types*/ nullptr, &output_shapes, + /*output_resource_shapes_and_types*/ nullptr, status_); + CHECK_EQ(TF_OK, TF_GetCode(status_)) << TF_Message(status_); + CHECK_EQ(output_shapes->num_items, 1); + EXPECT_EQ(output_shapes->items[0].num_dims, expected_rank); + if (expected_rank == 2) { + EXPECT_EQ(output_shapes->items[0].dims[0], expected_first_dim); + EXPECT_EQ(output_shapes->items[0].dims[1], expected_second_dim); + } + TF_DeleteShapeAndTypeList(input_shapes); + TF_DeleteShapeAndTypeList(output_shapes); + } + + TF_Status* status_; + TFE_ContextOptions* tfe_context_options_; + TFE_Context* tfe_context_; + TFE_Op* matmul_op_; +}; + +TEST_F(ShapeInferenceTest, InfersShapes) { + // Infer shape when everything is known. + int64_t _3by2[] = {3, 2}; + int64_t _2by4[] = {2, 4}; + TF_ShapeAndTypeList* input_shapes = TF_NewShapeAndTypeList(/*num_shapes*/ 2); + TF_ShapeAndTypeListSetShape(input_shapes, 0, _3by2, 2); + TF_ShapeAndTypeListSetShape(input_shapes, 1, _2by4, 2); + infer_matmul_shapes(input_shapes, /*expected_rank*/ 2, + /*expected_first_dim*/ 3, /*expected_second_dim*/ 4); + + // Infer shape when second operand has unknown shape. + TF_ShapeAndTypeList* input_shapes_unknown_second = + TF_NewShapeAndTypeList(/*num_shapes*/ 2); + TF_ShapeAndTypeListSetShape(input_shapes_unknown_second, 0, _3by2, 2); + TF_ShapeAndTypeListSetUnknownShape(input_shapes_unknown_second, 1); + infer_matmul_shapes( + input_shapes_unknown_second, /*expected_rank*/ 2, + /*expected_first_dim*/ 3, + /*expected_second_dim*/ shape_inference::InferenceContext::kUnknownDim); + + // Infer shape when some dimensions are unknown. 
+ int64_t _unknownby2[] = {-1, 2}; + TF_ShapeAndTypeList* input_shapes_unknown_dims = + TF_NewShapeAndTypeList(/*num_shapes*/ 2); + TF_ShapeAndTypeListSetShape(input_shapes_unknown_dims, 0, _unknownby2, 2); + TF_ShapeAndTypeListSetShape(input_shapes_unknown_dims, 1, _2by4, 2); + infer_matmul_shapes( + input_shapes_unknown_dims, /*expected_rank*/ 2, + /*expected_first_dim*/ shape_inference::InferenceContext::kUnknownDim, + /*expected_second_dim*/ 4); + + // Infer shape when everything is unknown. + TF_ShapeAndTypeList* unknown_shapes = + TF_NewShapeAndTypeList(/*num_shapes*/ 2); + TF_ShapeAndTypeListSetUnknownShape(unknown_shapes, 0); + TF_ShapeAndTypeListSetUnknownShape(unknown_shapes, 1); + infer_matmul_shapes( + unknown_shapes, /*expected_rank*/ 2, + /*expected_first_dim*/ shape_inference::InferenceContext::kUnknownDim, + /*expected_second_dim*/ shape_inference::InferenceContext::kUnknownDim); + + // TODO(bgogul): Add some death tests where status is not OK. +} + } // namespace } // namespace tensorflow From 19c2d6a5fef6402ec19d9680a63cd85d5b587ab7 Mon Sep 17 00:00:00 2001 From: Saurabh Saxena Date: Tue, 23 Jul 2019 16:51:47 -0700 Subject: [PATCH 0429/3053] Implement TensorListGather in xla. PiperOrigin-RevId: 259640097 --- .../compiler/tests/tensor_list_ops_test.py | 21 ++++++-- .../tf2xla/kernels/tensor_list_ops.cc | 54 +++++++++++++++++++ 2 files changed, 72 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/tests/tensor_list_ops_test.py b/tensorflow/compiler/tests/tensor_list_ops_test.py index b24e807b034..7d2425ee205 100644 --- a/tensorflow/compiler/tests/tensor_list_ops_test.py +++ b/tensorflow/compiler/tests/tensor_list_ops_test.py @@ -19,6 +19,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function import os +from absl.testing import parameterized import numpy as np from tensorflow.compiler.tests import xla_test from tensorflow.python.framework import constant_op @@ -29,7 +30,7 @@ from tensorflow.python.ops import list_ops from tensorflow.python.platform import test -class ListOpsTest(xla_test.XLATestCase): +class ListOpsTest(parameterized.TestCase, xla_test.XLATestCase): def testElementShape(self): with self.session() as sess, self.test_scope(): @@ -204,6 +205,20 @@ class ListOpsTest(xla_test.XLATestCase): self.assertAllEqual(t.shape.as_list(), [None]) self.assertAllEqual(t, [1.0, 2.0]) + @parameterized.named_parameters( + ("FlatList", [1.0, 2.0, 3.0], [], [0, 2], [1.0, 3.0]), + ("NestedList", [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0] + ], [2], [1], [[3.0, 4.0]]), + ("EmptyIndices", [1.0, 2.0, 3.0], [], [], []), + ) + def testGather(self, input_list, element_shape, indices, output): + with self.session(), self.test_scope(): + tensor_list = list_ops.tensor_list_from_tensor( + input_list, element_shape=element_shape) + gather_t = list_ops.tensor_list_gather( + tensor_list, indices, element_dtype=dtypes.float32) + self.assertAllEqual(gather_t, output) + def testStackWithUninitializedTensors(self): with self.session(), self.test_scope(): l = list_ops.tensor_list_reserve( @@ -224,6 +239,6 @@ class ListOpsTest(xla_test.XLATestCase): self.assertAllEqual(z, [0.0, 0.0]) if __name__ == "__main__": - os.environ['TF_XLA_FLAGS'] = ('--tf_xla_min_cluster_size=2 ' + - os.environ.get('TF_XLA_FLAGS', '')) + os.environ["TF_XLA_FLAGS"] = ("--tf_xla_min_cluster_size=2 " + + os.environ.get("TF_XLA_FLAGS", "")) test.main() diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc 
b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc index ac3d2c22d65..4af3d4233dd 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h" #include "tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" @@ -307,6 +308,59 @@ class TensorListGetItemOp : public XlaOpKernel { REGISTER_XLA_OP(Name("TensorListGetItem"), TensorListGetItemOp); +class TensorListGatherOp : public XlaOpKernel { + public: + explicit TensorListGatherOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("element_dtype", &dtype_)); + } + + void Compile(XlaOpKernelContext* ctx) override { + // Check that the TensorList is initialized. + bool is_initialized; + OP_REQUIRES_OK(ctx, + (IsTensorListInitialized(ctx->Input(0), &is_initialized))); + OP_REQUIRES(ctx, is_initialized, + errors::InvalidArgument("TensorList is not initialized")); + + // Only non-nested TensorList is supported for now. + bool is_nested; + OP_REQUIRES_OK(ctx, IsNestedTensorList(ctx->Input(0), &is_nested)); + OP_REQUIRES(ctx, !is_nested, + errors::Unimplemented("Only non-nested TensorList is supported " + "for TensorListGather.")); + + DataType indices_type = ctx->input_type(1); + + const TensorShape indices_shape = ctx->InputShape(1); + OP_REQUIRES(ctx, indices_shape.dims() == 1, + errors::InvalidArgument("indices must be rank 1")); + + xla::XlaOp list = ctx->Input(0); + xla::XlaOp indices = ctx->Input(1); + + xla::XlaOp buffer; + OP_REQUIRES_OK(ctx, GetTensorListBuffer(list, &buffer)); + xla::Shape buffer_xla_shape; + OP_REQUIRES_OK(ctx, GetTensorListBufferShape(list, &buffer_xla_shape)); + TensorShape buffer_shape; + OP_REQUIRES_OK(ctx, XLAShapeToTensorShape(buffer_xla_shape, &buffer_shape)); + + xla::XlaOp result; + OP_REQUIRES_OK( + ctx, XlaGather(buffer, buffer_shape, indices, indices_shape, /*axis=*/0, + /*indices_are_nd=*/false, dtype_, indices_type, + ctx->builder(), &result)); + ctx->SetOutput(0, result); + } + + private: + DataType dtype_; + + TF_DISALLOW_COPY_AND_ASSIGN(TensorListGatherOp); +}; + +REGISTER_XLA_OP(Name("TensorListGather"), TensorListGatherOp); + class TensorListStackOp : public XlaOpKernel { public: explicit TensorListStackOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} From 40a45e5d047297b187847ba5e5858c4b83209b57 Mon Sep 17 00:00:00 2001 From: Nupur Garg Date: Tue, 23 Jul 2019 16:55:34 -0700 Subject: [PATCH 0430/3053] Support StatelessIf op in freeze graph. PiperOrigin-RevId: 259640791 --- tensorflow/lite/python/BUILD | 2 -- .../python/framework/convert_to_constants.py | 23 ++++++------ .../framework/convert_to_constants_test.py | 35 ++++++++++++++++++- 3 files changed, 46 insertions(+), 14 deletions(-) diff --git a/tensorflow/lite/python/BUILD b/tensorflow/lite/python/BUILD index db0edd96aa0..9316da8e94c 100644 --- a/tensorflow/lite/python/BUILD +++ b/tensorflow/lite/python/BUILD @@ -143,8 +143,6 @@ py_test( tags = [ "no_oss", "no_windows", - # TODO(b/138223396) Re-enable after fixing compatibility horizon issue. 
- "notap", ], deps = [ ":lite", diff --git a/tensorflow/python/framework/convert_to_constants.py b/tensorflow/python/framework/convert_to_constants.py index 88274de8d96..4e2e24ca6e4 100644 --- a/tensorflow/python/framework/convert_to_constants.py +++ b/tensorflow/python/framework/convert_to_constants.py @@ -32,12 +32,12 @@ from tensorflow.python.ops import array_ops from tensorflow.python.training.saver import export_meta_graph -# TODO(nupurgarg): Handle StatelessIf op. -_CONTROL_FLOW_OPS = set(["If", "While"]) +_CONDITIONAL_OPS = set(["If", "StatelessIf"]) +_CONTROL_FLOW_OPS = _CONDITIONAL_OPS.union(set(["While"])) def disable_lower_using_switch_merge(graph_def): - """Set '_lower_using_switch_merge' attributes to False in If and While ops. + """Set '_lower_using_switch_merge' attributes to False. Sets the attribute to False in the NodeDefs in the main graph and the NodeDefs in each function's graph. @@ -202,9 +202,10 @@ def _get_control_flow_function_data(node_defs, tensor_data): Creates a map from function name to a list of types and a list of shapes that correspond with the function arguments. The data is primarily determined from - the corresponding "If" or "While" op. If the argument is a resource variable, - then the type is determined from the type of the data contained within the - Tensor. The shape data is only determined in the case of the "While" op. + the corresponding "If", "StatelessIf", or "While" op. If the argument is a + resource variable, then the type is determined from the type of the data + contained within the Tensor. The shape data is only determined in the case of + the "While" op. `is_also_output_type` is used to identify the "While" bodies that require the output types to be updated at the same time the input types are updated. @@ -238,7 +239,7 @@ def _get_control_flow_function_data(node_defs, tensor_data): } for node in node_defs: - if node.op == "If": + if node.op in _CONDITIONAL_OPS: arg_types = [dtype for dtype in node.attr["Tin"].list.type] for idx in range(len(arg_types)): @@ -297,7 +298,7 @@ def _populate_identity_op(output_node, input_node): def _populate_if_op(output_node, input_node, function_data): - """Updates the type attributes and the function names of the If op. + """Updates the type attributes and the function names of If or StatelessIf. Args: output_node: TensorFlow NodeDef. @@ -422,7 +423,7 @@ def convert_variables_to_constants_v2(func, lower_control_flow=True): converted_input_indices.add(tensor_data[node_name]["index"]) for node in node_defs: - if node.op == "If": + if node.op in _CONDITIONAL_OPS: # Get dtype and data for resource Placeholders. then_func = node.attr["then_branch"].func.name arg_types = function_data[then_func]["types"] @@ -499,7 +500,7 @@ def convert_variables_to_constants_v2(func, lower_control_flow=True): elif input_node.op == "ReadVariableOp": _populate_identity_op(output_node, input_node) # Update the function names and argument types for the conditional ops. - elif input_node.op == "If": + elif input_node.op in _CONDITIONAL_OPS: _populate_if_op(output_node, input_node, function_data) elif input_node.op == "While": _populate_while_op(output_node, input_node, function_data) @@ -550,7 +551,7 @@ def convert_variables_to_constants_v2(func, lower_control_flow=True): if input_node.op == "ReadVariableOp": _populate_identity_op(output_node, input_node) # Update the function names and argument types for the conditional ops. 
- elif input_node.op == "If": + elif input_node.op in _CONDITIONAL_OPS: _populate_if_op(output_node, input_node, function_data) elif input_node.op == "While": _populate_while_op(output_node, input_node, function_data) diff --git a/tensorflow/python/framework/convert_to_constants_test.py b/tensorflow/python/framework/convert_to_constants_test.py index 4db64572064..f962d5ebe47 100644 --- a/tensorflow/python/framework/convert_to_constants_test.py +++ b/tensorflow/python/framework/convert_to_constants_test.py @@ -31,6 +31,7 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_spec from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops +from tensorflow.python.ops import cond_v2 from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import rnn @@ -288,7 +289,8 @@ class VariablesToConstantsTest(test.TestCase): self._testConvertedFunction(root, fn, output_func, input_data) @test_util.run_v2_only - def testControlFlow(self): + def testIf(self): + """Test whether If op freezes correctly.""" input_data = { "x": constant_op.constant([1., 2.], shape=[1, 2]), "b": constant_op.constant(True) @@ -323,6 +325,37 @@ class VariablesToConstantsTest(test.TestCase): self._testConvertedFunction(root, root.f, output_func, input_data) + @test_util.run_v2_only + def testStatelessIf(self): + """Test whether StatelessIf op freezes correctly.""" + input_data = {"b": constant_op.constant(True)} + + x = constant_op.constant([1., 2.], shape=[1, 2], name="x") + + def true_fn(): + return x + + def false_fn(): + return x + 2 + + @def_function.function( + input_signature=[tensor_spec.TensorSpec(shape=(), dtype=dtypes.bool)]) + def model(b): + return cond_v2.cond_v2(b, true_fn, false_fn) + + root = tracking.AutoTrackable() + root.f = model + input_func = root.f.get_concrete_function() + input_func(**input_data) + + output_func = convert_to_constants.convert_variables_to_constants_v2( + input_func, lower_control_flow=False) + constant_graph_def = output_func.graph.as_graph_def() + self.assertEqual(0, self._getNumVariables(constant_graph_def)) + self.assertFalse(self._hasStatefulPartitionedCallOp(constant_graph_def)) + + self._testConvertedFunction(root, root.f, output_func, input_data) + @test_util.run_v2_only def testStaticRnn(self): input_data = { From 9fb51367a17d4c40cddea6660dcb2b4b373ac404 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 23 Jul 2019 17:08:55 -0700 Subject: [PATCH 0431/3053] Experimental 16 bit floating point support - core headers. This will only compile with compilers/platforms that support either _Float16 or __fp16. PiperOrigin-RevId: 259643324 --- .../lite/experimental/kernels/fp16/BUILD | 17 +++++ .../lite/experimental/kernels/fp16/common.h | 75 +++++++++++++++++++ 2 files changed, 92 insertions(+) create mode 100644 tensorflow/lite/experimental/kernels/fp16/BUILD create mode 100644 tensorflow/lite/experimental/kernels/fp16/common.h diff --git a/tensorflow/lite/experimental/kernels/fp16/BUILD b/tensorflow/lite/experimental/kernels/fp16/BUILD new file mode 100644 index 00000000000..14f9ff42532 --- /dev/null +++ b/tensorflow/lite/experimental/kernels/fp16/BUILD @@ -0,0 +1,17 @@ +# Experimental FP16-on-CPU implementation of a few select layers. 
+ +package( + licenses = ["notice"], # Apache 2.0 +) + +cc_library( + name = "common", + hdrs = [ + "common.h", + ], + deps = [ + "//tensorflow/lite:framework", + "//tensorflow/lite/c:c_api_internal", + "//tensorflow/lite/kernels/internal:tensor", + ], +) diff --git a/tensorflow/lite/experimental/kernels/fp16/common.h b/tensorflow/lite/experimental/kernels/fp16/common.h new file mode 100644 index 00000000000..8b82f1481b4 --- /dev/null +++ b/tensorflow/lite/experimental/kernels/fp16/common.h @@ -0,0 +1,75 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_EXPERIMENTAL_KERNELS_FP16_COMMON_H_ +#define TENSORFLOW_LITE_EXPERIMENTAL_KERNELS_FP16_COMMON_H_ + +// Experimental half precision floating point type compatible with IEEE 754-2008 +// binary16 format. + +#include "tensorflow/lite/c/c_api_internal.h" +#include "tensorflow/lite/interpreter.h" +#include "tensorflow/lite/kernels/internal/tensor_ctypes.h" + +#if __GNUC__ && ((__clang__ && (__aarch64__ || __arm__)) || \ + (!__cplusplus && __ARM_FP16_FORMAT_IEEE)) +#define TFL_HAS_IEEE_FP16 1 +#endif +#if __GNUC__ && \ + (__clang__ || __ARM_FP16_FORMAT_IEEE || __ARM_FP16_FORMAT_ALTERNATIVE) +#define TFL_HAS_ARM_FP16 1 +#endif + +namespace tflite { + +#if TFL_HAS_IEEE_FP16 +typedef _Float16 tfl_float16_t; +#elif TFL_HAS_ARM_FP16 +typedef __fp16 tfl_float16_t; +#else +// TODO(b/138252484): implement tfl_float16_t using third_party/FP16 +#error "This header requires FP16 support." +#endif + +// Check tfl_float16_t is 'compatible' with the placeholder type. +static_assert(sizeof(tfl_float16_t) == sizeof(TfLiteFloat16), + "Size of real and placeholder FP16 types don't match."); +static_assert(alignof(tfl_float16_t) == alignof(TfLiteFloat16), + "Alignment of real and placeholder FP16 types don't match."); + +// Specialization of typeToTfLiteType with tfl_float16_t. +// Template is declared in interpreter.h +template <> +constexpr TfLiteType typeToTfLiteType() { + return kTfLiteFloat16; +} + +// Specialization of GetTensorData with tfl_float16_t. +// Template is declared in kernels/internal/tensor_ctypes.h +template <> +inline tfl_float16_t* GetTensorData(TfLiteTensor* tensor) { + return tensor != nullptr ? reinterpret_cast(tensor->data.f16) + : nullptr; +} + +template <> +inline const tfl_float16_t* GetTensorData(const TfLiteTensor* tensor) { + return tensor != nullptr + ? 
reinterpret_cast(tensor->data.f16) + : nullptr; +} + +} // namespace tflite + +#endif // TENSORFLOW_LITE_EXPERIMENTAL_KERNELS_FP16_COMMON_H_ From 91028fbc7aba8a777b2652e1072c14c939f786be Mon Sep 17 00:00:00 2001 From: Kaixi Hou Date: Tue, 23 Jul 2019 17:22:22 -0700 Subject: [PATCH 0432/3053] nhwc plumbing on pool grad op --- tensorflow/core/kernels/pooling_ops_common.cc | 40 ++++++++++++++++++- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc index cd37a2570c9..325277c8658 100644 --- a/tensorflow/core/kernels/pooling_ops_common.cc +++ b/tensorflow/core/kernels/pooling_ops_common.cc @@ -317,6 +317,7 @@ void DnnPoolingGradOp::Compute( return; } +#if CUDNN_VERSION < 7300 /// For now, cudnn does not support NHWC format, so we need to convert it /// to NCHW before calling cudnn. We need to get rid of this once it is done Tensor transformed_input; @@ -382,6 +383,39 @@ void DnnPoolingGradOp::Compute( context->eigen_device(), out_backprop.tensor(), transformed_output_backprop.tensor()); } +#else + Tensor transformed_input; + if (!tensor_in) { + OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum::value, + tensor_in_shape, + &transformed_input)); + } else { + transformed_input = *tensor_in; + } + Tensor transformed_output; + if (!tensor_out) { + OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum::value, + out_backprop.shape(), + &transformed_output)); + } else { + transformed_output = *tensor_out; + } + Tensor transformed_input_backprop = *input_backprop; + Tensor transformed_output_backprop = out_backprop; + se::dnn::DataLayout data_layout; + switch (data_format) { + case FORMAT_NHWC: + data_layout = se::dnn::DataLayout::kBatchYXDepth; + break; + case FORMAT_NCHW: + data_layout = se::dnn::DataLayout::kBatchDepthYX; + break; + default: + OP_REQUIRES(context, false, + errors::InvalidArgument("Unsupported format: ", + ToString(data_format))); + } +#endif // CUDNN_VERSION < 7300 /// Get ready to call cudnn se::dnn::PoolingDescriptor pooling_desc; @@ -399,14 +433,14 @@ void DnnPoolingGradOp::Compute( .set_height(params.out_height) .set_width(params.out_width) .set_feature_map_count(params.depth) - .set_layout(se::dnn::DataLayout::kBatchDepthYX); + .set_layout(data_layout); se::dnn::BatchDescriptor orig_input_desc; orig_input_desc.set_count(params.tensor_in_batch) .set_height(params.tensor_in_rows) .set_width(params.tensor_in_cols) .set_feature_map_count(params.depth) - .set_layout(se::dnn::DataLayout::kBatchDepthYX); + .set_layout(data_layout); auto orig_output_data = AsDeviceMemory(transformed_output.template flat().data(), @@ -449,6 +483,7 @@ void DnnPoolingGradOp::Compute( OP_REQUIRES(context, status, errors::Internal("dnn PoolBackward launch failed")); +#if CUDNN_VERSION < 7300 if (data_format == FORMAT_NHWC) { /// Transform the output data from NCHW back to NHWC. auto toConstTensor = [](const Tensor& x) -> const Tensor { return x; }; @@ -457,6 +492,7 @@ void DnnPoolingGradOp::Compute( toConstTensor(transformed_input_backprop).template tensor(), input_backprop->tensor()); } +#endif // CUDNN_VERSION < 7300 } #define DEFINE_DNN_OPS(T) \ From 3379e8b3ee4f8c17e7a6115f14e62c5c6a41f7d3 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 23 Jul 2019 17:09:12 -0700 Subject: [PATCH 0433/3053] Update ops-related pbtxt files. 
PiperOrigin-RevId: 259643377 --- .../core/ops/compat/ops_history.v1.pbtxt | 18 ++++++++++++++++++ tensorflow/core/ops/ops.pbtxt | 18 ++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt index bbcb06f32ee..d163bf58d62 100644 --- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt +++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt @@ -12320,6 +12320,24 @@ op { } is_stateful: true } +op { + name: "BoostedTreesFlushQuantileSummaries" + input_arg { + name: "quantile_stream_resource_handle" + type: DT_RESOURCE + } + output_arg { + name: "summaries" + type: DT_FLOAT + number_attr: "num_features" + } + attr { + name: "num_features" + type: "int" + has_minimum: true + } + is_stateful: true +} op { name: "BoostedTreesGetEnsembleStates" input_arg { diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index ba9658c5084..b119eee1530 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -5075,6 +5075,24 @@ op { } is_stateful: true } +op { + name: "BoostedTreesFlushQuantileSummaries" + input_arg { + name: "quantile_stream_resource_handle" + type: DT_RESOURCE + } + output_arg { + name: "summaries" + type: DT_FLOAT + number_attr: "num_features" + } + attr { + name: "num_features" + type: "int" + has_minimum: true + } + is_stateful: true +} op { name: "BoostedTreesGetEnsembleStates" input_arg { From 3dbe6083efa7b05b3b4d5ff2d8b4a3b45d56ce42 Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Tue, 23 Jul 2019 17:35:57 -0700 Subject: [PATCH 0434/3053] TRT minor improvements & correctionx --- .../tf2tensorrt/convert/convert_graph.cc | 1 - .../tf2tensorrt/convert/convert_graph.h | 4 ++++ .../tf2tensorrt/kernels/trt_engine_op.cc | 4 ---- .../tf2tensorrt/kernels/trt_engine_op_test.cc | 2 ++ .../tf2tensorrt/utils/funcdef_to_graphdef.cc | 6 ++++-- .../test/tf_trt_integration_test_base.py | 7 +++---- .../compiler/tensorrt/trt_convert_test.py | 17 ++++------------- 7 files changed, 17 insertions(+), 24 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index 15096961632..a6ebebe5a60 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -66,7 +66,6 @@ using absl::StrCat; namespace { - Status BuildNodeMap(const Graph& graph, std::unordered_map* node_map) { for (auto* node : graph.op_nodes()) { diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h index 62af1af338f..476cedaa180 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h @@ -56,9 +56,13 @@ Status ConvertAfterShapes(const ConversionParams& params); std::pair GetDeviceAndAllocator(const ConversionParams& params, const EngineInfo& engine); +// Method to replace Placeholder and identity nodes with Arg and Retval. +// graph is the full graph, while segment_graph is only the segment. Status ModifyGraphForFunctionDef(Graph* graph, const GraphDef& segment, Graph* segment_graph); +// Method that registers the segment graph to a function library. +// graph is the full graph, while segment_graph is only the segment. 
Status RegisterModifiedGraphToFunctionLibrary(Graph* segment_graph, Graph* graph, FunctionDefLibrary fdeflib, const string& engine_name); diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc index ca23f84aead..353e787dd75 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op.cc @@ -193,7 +193,6 @@ void* GetTensorAddress(const Tensor* tensor_ptr) { Status TRTEngineOp::ConstructFunctionHandle(FunctionLibraryRuntime* lib, const string& device_name) { VLOG(1) << "Constructing function handle"; - // auto lib = ctx->function_library(); if (lib == nullptr) { return errors::Internal("Context function library is null"); } @@ -254,9 +253,6 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context) void TRTEngineOp::ExecuteNativeSegment(OpKernelContext* ctx, AsyncHelper* helper) { - OP_REQUIRES_ASYNC(ctx, !funcdef_name_.empty(), - errors::Internal("Fallback path is disabled, for ", name()), - *helper); std::vector inputs; std::vector* outputs = new std::vector(); if (native_func_ == kInvalidHandle) { diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc index 08330b58bd7..b5056fa5b91 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc @@ -51,6 +51,7 @@ class TRTEngineOpTestBase : public OpsTestBase { // Create the GPU device. std::unique_ptr device( DeviceFactory::NewDevice("GPU", {}, "/job:worker/replica:0/task:0")); + // Create simple TF graph. Scope s = Scope::NewRootScope(); auto feed = ops::Placeholder(s.WithOpName("TensorRTInputPH_0"), dtype, @@ -71,6 +72,7 @@ class TRTEngineOpTestBase : public OpsTestBase { PartialTensorShape shape({-1, -1}); + // Create the op. 
OpsTestBase::SetDevice(DEVICE_GPU, std::move(device)); TF_ASSERT_OK(NodeDefBuilder("myop", "TRTEngineOp") .Input(FakeInput(1, dtype)) diff --git a/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc b/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc index d17f6efc1fc..a9810bbc011 100644 --- a/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc +++ b/tensorflow/compiler/tf2tensorrt/utils/funcdef_to_graphdef.cc @@ -131,8 +131,10 @@ Status FunctionDefToGraphDef(FunctionLibraryRuntime::Handle handle, ToGraphDefWithIOPrefix(graph.release(), graph_def); - for (const auto node_def : graph_def->node()) { - string node_name = node_def.name(); + if VLOG_IS_ON(2) { + for (const auto node_def : graph_def->node()) { + VLOG(2) << "Node name after FunctionDefToGraphDef: " << node_def.name(); + } } return Status::OK(); diff --git a/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py b/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py index 6627c3788a4..6971f735514 100644 --- a/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py +++ b/tensorflow/python/compiler/tensorrt/test/tf_trt_integration_test_base.py @@ -558,10 +558,9 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase): segment_funcdef_name = node.attr["segment_funcdef_name"].s function_name = node.name + "_native_segment" is_dynamic_engine = not node.attr["static_engine"].b - if IsQuantizationWithCalibration(run_params) or is_dynamic_engine: - self.assertNotEmpty(segment_funcdef_name, node.name) - self.assertIn(function_name, functions) - else: + self.assertNotEmpty(segment_funcdef_name, node.name) + self.assertIn(function_name, functions) + if not IsQuantizationWithCalibration and not is_dynamic_engine: self.assertTrue(len(node.attr["serialized_segment"].s), node.name) self.assertIn(node.name, expected_engines) self.assertEqual( diff --git a/tensorflow/python/compiler/tensorrt/trt_convert_test.py b/tensorflow/python/compiler/tensorrt/trt_convert_test.py index b8376a5ca65..41c2c28e21a 100644 --- a/tensorflow/python/compiler/tensorrt/trt_convert_test.py +++ b/tensorflow/python/compiler/tensorrt/trt_convert_test.py @@ -442,14 +442,9 @@ class TrtConvertTest(test_util.TensorFlowTestCase): sess, batch_size, expect_engine_is_run=True): - try: - result = sess.run( - "output:0", feed_dict={"input:0": [[[1.0]]] * batch_size}) - self.assertAllEqual([[[4.0]]] * batch_size, result) - except errors.OpError as e: - # This should happen only when fallback path is disabled and TRT engine - # fails to run. 
- self.assertIn("Fallback path is disabled, for TRTEngineOp_0", str(e)) + result = sess.run( + "output:0", feed_dict={"input:0": [[[1.0]]] * batch_size}) + self.assertAllEqual([[[4.0]]] * batch_size, result) @test_util.deprecated_graph_mode_only def testTrtGraphConverter_MinimumSegmentSize(self): @@ -554,11 +549,7 @@ class TrtConvertTest(test_util.TensorFlowTestCase): expect_engine_is_run=False) @test_util.deprecated_graph_mode_only - def testTrtGraphConverter_StaticOp_NoFallback(self): - self._TestStaticOp() - - @test_util.deprecated_graph_mode_only - def testTrtGraphConverter_StaticOp_WithFallback(self): + def testTrtGraphConverter_StaticOp(self): self._TestStaticOp() From e1de70abd79a91cfe46e6396bf83fdc45e10f224 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Tue, 23 Jul 2019 17:15:30 -0700 Subject: [PATCH 0435/3053] Automated rollback of commit 07a6725462ac030eddfd7fb9bed8c299482d0f57 PiperOrigin-RevId: 259644296 --- .../graph-custom-operation.pbtxt | 2169 ++++++++++++++++- 1 file changed, 2150 insertions(+), 19 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt index 74984c35480..82146716fff 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt @@ -1,8 +1,209 @@ # RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s node { - name: "Constant" + name: "Placeholder" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "shape" + value { + shape { + unknown_rank: true + } + } + } +} +node { + name: "Placeholder_1" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "shape" + value { + shape { + unknown_rank: true + } + } + } +} +node { + name: "input0" + op: "TPUReplicatedInput" + input: "Placeholder" + attr { + key: "N" + value { + i: 1 + } + } + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "input1" + op: "TPUReplicatedInput" + input: "Placeholder_1" + attr { + key: "N" + value { + i: 1 + } + } + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "cluster/pivot" + op: "NoOp" +} +node { + name: "TPUReplicateMetadata" + op: "TPUReplicateMetadata" + input: "^cluster/pivot" + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + key: "computation_shape" + value { + list { + } + } + } + attr { + key: "device_assignment" + value { + list { + } + } + } + attr { + key: "host_compute_core" + value { + list { + } + } + } + attr { + key: "num_cores_per_replica" + value { + i: 1 + } + } + attr { + key: "num_replicas" + value { + i: 1 + } + } + attr { + key: "topology" + value { + s: "" + } + } + attr { + key: "use_tpu" + value { + b: true + } + } +} +node { + name: "replicated_input_0" + op: "Identity" + input: "input0" + input: "^TPUReplicateMetadata" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "replicated_input_1" + op: "Identity" + input: "input1" + input: "^TPUReplicateMetadata" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "while/maximum_iterations" op: "Const" + input: "^TPUReplicateMetadata" + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + 
key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 10 + } + } + } +} +node { + name: "while/iteration_counter" + op: "Const" + input: "^TPUReplicateMetadata" + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } attr { key: "dtype" value { @@ -22,38 +223,1968 @@ node { } } node { - name: "_tf.foo" - op: "foo" - input: "Constant" + name: "while/Enter" + op: "Enter" + input: "while/iteration_counter" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + key: "frame_name" + value { + s: "while/while_context" + } + } + attr { + key: "is_constant" + value { + b: false + } + } + attr { + key: "parallel_iterations" + value { + i: 10 + } + } +} +node { + name: "while/Enter_1" + op: "Enter" + input: "replicated_input_0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + key: "frame_name" + value { + s: "while/while_context" + } + } + attr { + key: "is_constant" + value { + b: false + } + } + attr { + key: "parallel_iterations" + value { + i: 10 + } + } +} +node { + name: "while/Enter_2" + op: "Enter" + input: "replicated_input_1" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + key: "frame_name" + value { + s: "while/while_context" + } + } + attr { + key: "is_constant" + value { + b: false + } + } + attr { + key: "parallel_iterations" + value { + i: 10 + } + } +} +node { + name: "while/Merge" + op: "Merge" + input: "while/Enter" + input: "while/NextIteration" + attr { + key: "N" + value { + i: 2 + } + } + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "while/Merge_1" + op: "Merge" + input: "while/Enter_1" + input: "while/NextIteration_1" + attr { + key: "N" + value { + i: 2 + } + } + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "while/Merge_2" + op: "Merge" + input: "while/Enter_2" + input: "while/NextIteration_2" + attr { + key: "N" + value { + i: 2 + } + } + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "while/Less/Enter" + op: "Enter" + input: "while/maximum_iterations" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + key: "frame_name" + value { + s: "while/while_context" + } + } + attr { + key: "is_constant" + value { + b: true + } + } + attr { + key: "parallel_iterations" + value { + i: 10 + } + } +} +node { + name: "while/Less" + op: "Less" + input: "while/Merge" + input: "while/Less/Enter" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "while/less_than_5_If8q4vKg9jA" + op: "less_than_5_If8q4vKg9jA" + input: "while/Merge_1" + input: "^while/Merge" + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "while/LogicalAnd" + op: "LogicalAnd" + input: "while/Less" + input: "while/less_than_5_If8q4vKg9jA" + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "while/LoopCond" + op: "LoopCond" + input: "while/LogicalAnd" + attr { + key: "_tpu_replicate" + 
value { + s: "cluster" + } + } +} +node { + name: "while/Switch" + op: "Switch" + input: "while/Merge" + input: "while/LoopCond" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_class" + value { + list { + s: "loc:@while/Merge" + } + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "while/Switch_1" + op: "Switch" + input: "while/Merge_1" + input: "while/LoopCond" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_class" + value { + list { + s: "loc:@while/Merge_1" + } + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "while/Switch_2" + op: "Switch" + input: "while/Merge_2" + input: "while/LoopCond" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_class" + value { + list { + s: "loc:@while/Merge_2" + } + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "while/Identity" + op: "Identity" + input: "while/Switch:1" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "while/Identity_1" + op: "Identity" + input: "while/Switch_1:1" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "while/Identity_2" + op: "Identity" + input: "while/Switch_2:1" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "while/add/y" + op: "Const" + input: "^while/Identity" + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 1 + } + } + } +} +node { + name: "while/add" + op: "Add" + input: "while/Identity" + input: "while/add/y" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "while/add_1/y" + op: "Const" + input: "^while/Identity" + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 1 + } + } + } +} +node { + name: "while/add_1" + op: "Add" + input: "while/Identity_1" + input: "while/add_1/y" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "while/mul_2_Da30D05wlPU" + op: "mul_2_Da30D05wlPU" + input: "while/Identity_1" + input: "while/Identity_2" + input: "^while/Identity" + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "while/NextIteration" + op: "NextIteration" + input: "while/add" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "while/NextIteration_1" + op: "NextIteration" + input: "while/add_1" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "while/NextIteration_2" + op: "NextIteration" + input: "while/mul_2_Da30D05wlPU" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "while/Exit" + op: "Exit" + input: "while/Switch" + attr { + key: "T" + value { 
+ type: DT_INT32 + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "while/Exit_1" + op: "Exit" + input: "while/Switch_1" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "while/Exit_2" + op: "Exit" + input: "while/Switch_2" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "gradients/Shape" + op: "Shape" + input: "while/Exit_2" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + key: "out_type" + value { + type: DT_INT32 + } + } +} +node { + name: "gradients/grad_ys_0" + op: "Const" + input: "^TPUReplicateMetadata" + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 1 + } + } + } +} +node { + name: "gradients/Fill" + op: "Fill" + input: "gradients/Shape" + input: "gradients/grad_ys_0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + key: "index_type" + value { + type: DT_INT32 + } + } +} +node { + name: "gradients/f_count" + op: "Const" + input: "^TPUReplicateMetadata" + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 0 + } + } + } +} +node { + name: "gradients/f_count_1" + op: "Enter" + input: "gradients/f_count" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + key: "frame_name" + value { + s: "while/while_context" + } + } + attr { + key: "is_constant" + value { + b: false + } + } + attr { + key: "parallel_iterations" + value { + i: 10 + } + } +} +node { + name: "gradients/Merge" + op: "Merge" + input: "gradients/f_count_1" + input: "gradients/NextIteration" + attr { + key: "N" + value { + i: 2 + } + } + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "gradients/Switch" + op: "Switch" + input: "gradients/Merge" + input: "while/LoopCond" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "gradients/Add/y" + op: "Const" + input: "^while/Identity" + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 1 + } + } + } +} +node { + name: "gradients/Add" + op: "Add" + input: "gradients/Switch:1" + input: "gradients/Add/y" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "gradients/f_count_2" + op: "Exit" + input: "gradients/Switch" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "gradients/b_count" + op: "Const" + input: "^TPUReplicateMetadata" + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: 
"value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 1 + } + } + } +} +node { + name: "gradients/b_count_1" + op: "Enter" + input: "gradients/f_count_2" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + key: "frame_name" + value { + s: "gradients/while/while_context" + } + } + attr { + key: "is_constant" + value { + b: false + } + } + attr { + key: "parallel_iterations" + value { + i: 10 + } + } +} +node { + name: "gradients/Merge_1" + op: "Merge" + input: "gradients/b_count_1" + input: "gradients/NextIteration_1" + attr { + key: "N" + value { + i: 2 + } + } + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "gradients/GreaterEqual/Enter" + op: "Enter" + input: "gradients/b_count" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + key: "frame_name" + value { + s: "gradients/while/while_context" + } + } + attr { + key: "is_constant" + value { + b: true + } + } + attr { + key: "parallel_iterations" + value { + i: 10 + } + } +} +node { + name: "gradients/GreaterEqual" + op: "GreaterEqual" + input: "gradients/Merge_1" + input: "gradients/GreaterEqual/Enter" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "gradients/b_count_2" + op: "LoopCond" + input: "gradients/GreaterEqual" + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "gradients/Switch_1" + op: "Switch" + input: "gradients/Merge_1" + input: "gradients/b_count_2" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "gradients/Sub" + op: "Sub" + input: "gradients/Switch_1:1" + input: "gradients/GreaterEqual/Enter" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "gradients/b_count_3" + op: "Exit" + input: "gradients/Switch_1" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "gradients/zeros_like" + op: "ZerosLike" + input: "while/Exit_1" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "gradients/while/Exit_2_grad/b_exit" + op: "Enter" + input: "gradients/Fill" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + key: "frame_name" + value { + s: "gradients/while/while_context" + } + } + attr { + key: "is_constant" + value { + b: false + } + } + attr { + key: "parallel_iterations" + value { + i: 10 + } + } +} +node { + name: "gradients/while/Exit_1_grad/b_exit" + op: "Enter" + input: "gradients/zeros_like" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + key: "frame_name" + value { + s: "gradients/while/while_context" + } + } + attr { + key: "is_constant" + value { + b: false + } + } + attr { + key: "parallel_iterations" + value { + i: 10 + } + } +} +node { + name: "gradients/while/Switch_2_grad/b_switch" + op: "Merge" + input: "gradients/while/Exit_2_grad/b_exit" + input: "gradients/while/Switch_2_grad_1/NextIteration" + attr { + 
key: "N" + value { + i: 2 + } + } + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "gradients/while/Merge_2_grad/Switch" + op: "Switch" + input: "gradients/while/Switch_2_grad/b_switch" + input: "gradients/b_count_2" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_class" + value { + list { + s: "loc:@gradients/while/Switch_2_grad/b_switch" + } + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "gradients/while/Enter_2_grad/Exit" + op: "Exit" + input: "gradients/while/Merge_2_grad/Switch" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Const" + op: "Const" + input: "^cluster/pivot" + attr { + key: "_class" + value { + list { + s: "loc:@while/Identity_1" + } + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 1 + } + } + } +} +node { + name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/mul" + op: "Mul" + input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Const" + input: "while/maximum_iterations" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_class" + value { + list { + s: "loc:@while/Identity_1" + } + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/f_acc" + op: "StackV2" + input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/mul" + attr { + key: "_class" + value { + list { + s: "loc:@while/Identity_1" + } + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + key: "elem_type" + value { + type: DT_FLOAT + } + } + attr { + key: "stack_name" + value { + s: "" + } + } +} +node { + name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Enter" + op: "Enter" + input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/f_acc" + attr { + key: "T" + value { + type: DT_RESOURCE + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + key: "frame_name" + value { + s: "while/while_context" + } + } + attr { + key: "is_constant" + value { + b: true + } + } + attr { + key: "parallel_iterations" + value { + i: 10 + } + } +} +node { + name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPushV2" + op: "StackPushV2" + input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Enter" + input: "while/Identity_1" + input: "^gradients/Add" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + key: "swap_memory" + value { + b: false + } + } +} +node { + name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2/Enter" + op: "Enter" + input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/f_acc" + attr { + key: "T" + value { + type: DT_RESOURCE + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + key: "frame_name" + value { + s: "gradients/while/while_context" + } + } + attr { + key: "is_constant" + value { + b: true + } + } + attr { + key: "parallel_iterations" + value { + i: 10 + } + } +} +node { + name: 
"gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2" + op: "StackPopV2" + input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2/Enter" + input: "^gradients/Sub" + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + key: "elem_type" + value { + type: DT_FLOAT + } + } +} +node { + name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Const_1" + op: "Const" + input: "^cluster/pivot" + attr { + key: "_class" + value { + list { + s: "loc:@while/Identity_2" + } + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 1 + } + } + } +} +node { + name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/mul_1" + op: "Mul" + input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Const_1" + input: "while/maximum_iterations" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_class" + value { + list { + s: "loc:@while/Identity_2" + } + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/f_acc_1" + op: "StackV2" + input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/mul_1" + attr { + key: "_class" + value { + list { + s: "loc:@while/Identity_2" + } + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + key: "elem_type" + value { + type: DT_FLOAT + } + } + attr { + key: "stack_name" + value { + s: "" + } + } +} +node { + name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Enter_1" + op: "Enter" + input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/f_acc_1" + attr { + key: "T" + value { + type: DT_RESOURCE + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + key: "frame_name" + value { + s: "while/while_context" + } + } + attr { + key: "is_constant" + value { + b: true + } + } + attr { + key: "parallel_iterations" + value { + i: 10 + } + } +} +node { + name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPushV2_1" + op: "StackPushV2" + input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Enter_1" + input: "while/Identity_2" + input: "^gradients/Add" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + key: "swap_memory" + value { + b: false + } + } +} +node { + name: "gradients/NextIteration" + op: "NextIteration" + input: "gradients/Add" + input: "^gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPushV2" + input: "^gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPushV2_1" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2_1/Enter" + op: "Enter" + input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/f_acc_1" + attr { + key: "T" + value { + type: DT_RESOURCE + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + key: "frame_name" + value { + s: "gradients/while/while_context" + } + } + attr { + key: "is_constant" + value { + b: true + } + } + attr { + key: "parallel_iterations" + value { + i: 10 + } + } +} +node { + name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2_1" + op: "StackPopV2" + input: 
"gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2_1/Enter" + input: "^gradients/Sub" + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + key: "elem_type" + value { + type: DT_FLOAT + } + } +} +node { + name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient" + op: "SymbolicGradient" + input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2" + input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2_1" + input: "gradients/while/Merge_2_grad/Switch:1" + input: "^gradients/Sub" + attr { + key: "Tin" + value { + list { + type: DT_FLOAT + type: DT_FLOAT + type: DT_FLOAT + } + } + } + attr { + key: "Tout" + value { + list { + type: DT_FLOAT + type: DT_FLOAT + } + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + attr { + key: "f" + value { + func { + name: "mul_2_Da30D05wlPU" + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } + } + } + } +} +node { + name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/b_sync" + op: "ControlTrigger" + input: "^cluster/pivot" + input: "^gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2" + input: "^gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2_1" + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "gradients/NextIteration_1" + op: "NextIteration" + input: "gradients/Sub" + input: "^gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/b_sync" + attr { + key: "T" + value { + type: DT_INT32 + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "gradients/while/Switch_2_grad_1/NextIteration" + op: "NextIteration" + input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient:1" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "NoOp" + op: "NoOp" + input: "^cluster/pivot" + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "Identity" + op: "Identity" + input: "gradients/while/Enter_2_grad/Exit" + device: "/device:TPU_REPLICATED_CORE:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_tpu_replicate" + value { + s: "cluster" + } + } +} +node { + name: "output0" + op: "TPUReplicatedOutput" + input: "Identity" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "num_replicas" + value { + i: 1 + } + } +} +node { + name: "TPUCompilationResult" + op: "TPUCompilationResult" + input: "^TPUReplicateMetadata" + attr { + key: "_tpu_compilation_status" + value { + s: "cluster" + } + } +} +node { + name: "output_0_shard_0" + op: "Identity" + input: "output0" + input: "^NoOp" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +node { + name: "ConfigureDistributedTPU" + op: "ConfigureDistributedTPU" + device: "/device:TPU_SYSTEM:0" + attr { + key: "embedding_config" + value { + s: "" + } + } + attr { + key: "is_global_init" + value { + b: false + } + } + attr { + key: "tpu_embedding_config" + value { + s: "" + } + } } library { function { signature { - name: "foo" + name: "mul_2_Da30D05wlPU" input_arg { - name: "arg" - type: DT_INT32 + name: "mul_2_da30d05wlpu" + type: DT_FLOAT + } + input_arg { + name: "mul_2_da30d05wlpu1" + type: DT_FLOAT } output_arg { - name: "return_value" - type: DT_INT32 + name: "mul_2_da30d05wlpu2" + type: DT_FLOAT + } + } + node_def { + name: "mul/y" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: 
"value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + dim { + size: 1 + } + dim { + size: 1 + } + } + float_val: 2 + } + } + } + } + node_def { + name: "mul_0" + op: "Mul" + input: "mul_2_da30d05wlpu1" + input: "mul/y:output:0" + attr { + key: "T" + value { + type: DT_FLOAT + } } } ret { - key: "return_value" - value: "arg" + key: "mul_2_da30d05wlpu2" + value: "mul_0:z:0" + } + attr { + key: "_noinline" + value { + b: true + } + } + } + function { + signature { + name: "less_than_5_If8q4vKg9jA" + input_arg { + name: "less_than_5_if8q4vkg9ja" + type: DT_FLOAT + } + output_arg { + name: "less_than_5_if8q4vkg9ja1" + type: DT_BOOL + } + } + node_def { + name: "Less/y" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 5 + } + } + } + } + node_def { + name: "Less" + op: "Less" + input: "less_than_5_if8q4vkg9ja" + input: "Less/y:output:0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + } + ret { + key: "less_than_5_if8q4vkg9ja1" + value: "Less:z:0" + } + attr { + key: "_noinline" + value { + b: true + } } } } versions { - producer: 62 + producer: 27 min_consumer: 12 } - -# Verify that we can import a custom operation that maps to a function and that -# the names are matching between the function definition and the uses / call -# site (a numerical suffix may be appended). - -# CHECK: "tf.foo0" -# CHECK: func @foo0 +# CHECK: func @main() { +# CHECK: %30:2 = "_tf.less_than_5_If8q4vKg9jA0"(%23#0, %29#2) {_tpu_replicate = "cluster", device = "", name = "while/less_than_5_If8q4vKg9jA"} : (tensor<*xf32>, !_tf.control) -> (tensor<*xi1>, !_tf.control) +# CHECK: %73:2 = "_tf.mul_2_Da30D05wlPU0"(%58#0, %72#0, %47#1) {_tpu_replicate = "cluster", device = "", name = "while/mul_2_Da30D05wlPU"} : (tensor<*xf32>, tensor<*xf32>, !_tf.control) -> (tensor<*xf32>, !_tf.control) +# CHECK: return +# CHECK-NEXT: } +# CHECK: func @less_than_5_If8q4vKg9jA0(%arg0: tensor<*xf32>) -> tensor<*xi1> +# CHECK-NEXT: attributes {tf._noinline = true} { +# CHECK-NEXT: %0:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_FLOAT", name = "Less/y", value = dense<5.000000e+00> : tensor} : () -> (tensor, !_tf.control) +# CHECK-NEXT: %1:2 = "_tf.Less"(%arg0, %0#0) {T = "tfdtype$DT_FLOAT", device = "", name = "Less"} : (tensor<*xf32>, tensor) -> (tensor<*xi1>, !_tf.control) +# CHECK-NEXT: return %1#0 : tensor<*xi1> +# CHECK-NEXT: } +# CHECK: func @mul_2_Da30D05wlPU0(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> +# CHECK-NEXT: attributes {tf._noinline = true} { +# CHECK-NEXT: %0:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_FLOAT", name = "mul/y", value = dense<2.000000e+00> : tensor<1x1xf32>} : () -> (tensor<1x1xf32>, !_tf.control) +# CHECK-NEXT: %1:2 = "_tf.Mul"(%arg1, %0#0) {T = "tfdtype$DT_FLOAT", device = "", name = "mul_0"} : (tensor<*xf32>, tensor<1x1xf32>) -> (tensor<*xf32>, !_tf.control) +# CHECK-NEXT: return %1#0 : tensor<*xf32> +# CHECK-NEXT: } From 7e0ac8e44001a3fe2d1af5753d19dc3acb9209c2 Mon Sep 17 00:00:00 2001 From: Saurabh Saxena Date: Tue, 23 Jul 2019 17:19:03 -0700 Subject: [PATCH 0436/3053] Fix device to device copy of nested variants. 
PiperOrigin-RevId: 259644840 --- tensorflow/core/common_runtime/copy_tensor.cc | 39 ++++++++++++------- .../python/kernel_tests/list_ops_test.py | 25 ++++++++++++ 2 files changed, 49 insertions(+), 15 deletions(-) diff --git a/tensorflow/core/common_runtime/copy_tensor.cc b/tensorflow/core/common_runtime/copy_tensor.cc index 38f8fb96b42..844dbc2a198 100644 --- a/tensorflow/core/common_runtime/copy_tensor.cc +++ b/tensorflow/core/common_runtime/copy_tensor.cc @@ -136,28 +136,37 @@ void CopyDeviceToDevice(CopyTensor::CopyFunction copy_function, status_cb->Unref(); }; auto copier = std::bind( - [copy_function, src, dst, src_alloc_attr, dst_alloc_attr, + [copy_function, cpu_allocator, src, dst, src_alloc_attr, dst_alloc_attr, recv_dev_context, send_dev_context, out_allocator, status_cb, dev_to_dev_stream_index](StatusCallback wrapped_done_, // Begin unbound arguments const Tensor& from, Tensor* to) { - if (!DMAHelper::CanUseDMA(&from)) { - Status err = errors::InvalidArgument( - "During Variant Device->Device Copy: " - "non-DMA-copy attempted of tensor type: ", - DataTypeString(from.dtype())); - status_cb->UpdateStatus(err); - return err; - } - if (status_cb->ok()) { + if (from.dtype() == DT_VARIANT) { status_cb->Ref(); - *to = Tensor(out_allocator, from.dtype(), from.shape()); - copy_function(send_dev_context, recv_dev_context, src, dst, - src_alloc_attr, dst_alloc_attr, &from, to, - dev_to_dev_stream_index, std::move(wrapped_done_)); + CopyDeviceToDevice(copy_function, cpu_allocator, out_allocator, + send_dev_context, recv_dev_context, src, dst, + src_alloc_attr, dst_alloc_attr, &from, to, + dev_to_dev_stream_index, wrapped_done_); return Status::OK(); } else { - return status_cb->status(); + if (!DMAHelper::CanUseDMA(&from)) { + Status err = errors::InvalidArgument( + "During Variant Device->Device Copy: ", src->name(), " to ", + dst->name(), " non-DMA-copy attempted of tensor type: ", + DataTypeString(from.dtype())); + status_cb->UpdateStatus(err); + return err; + } + if (status_cb->ok()) { + status_cb->Ref(); + *to = Tensor(out_allocator, from.dtype(), from.shape()); + copy_function(send_dev_context, recv_dev_context, src, dst, + src_alloc_attr, dst_alloc_attr, &from, to, + dev_to_dev_stream_index, std::move(wrapped_done_)); + return Status::OK(); + } else { + return status_cb->status(); + } } }, std::move(wrapped_done), std::placeholders::_1, std::placeholders::_2); diff --git a/tensorflow/python/kernel_tests/list_ops_test.py b/tensorflow/python/kernel_tests/list_ops_test.py index 052e012187c..edd1f6df7c3 100644 --- a/tensorflow/python/kernel_tests/list_ops_test.py +++ b/tensorflow/python/kernel_tests/list_ops_test.py @@ -1582,6 +1582,31 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase): tensor_list, 0, element_dtype=dtypes.float32) self.assertAllEqual(element.shape.as_list(), []) + @test_util.run_gpu_only + def testNestedListDevicetoDeviceCopy(self): + if context.num_gpus() < 2: + self.skipTest("Need at least 2 GPUs for this test, found %d" % + context.num_gpus()) + with ops.device("gpu:0"): + t = constant_op.constant([1.0, 2.0, 3.0]) + inner_l = list_ops.tensor_list_from_tensor(t, element_shape=[]) + outer_l = list_ops.empty_tensor_list( + element_dtype=dtypes.variant, element_shape=[]) + outer_l = list_ops.tensor_list_push_back(outer_l, inner_l) + + # Stress test. 
+ for _ in range(1024): + with ops.device("gpu:1"): + outer_l = array_ops.identity(outer_l) + with ops.device("gpu:0"): + outer_l = array_ops.identity(outer_l) + + with ops.device("gpu:1"): + _, inner_l = list_ops.tensor_list_pop_back( + outer_l, element_dtype=dtypes.variant) + t = list_ops.tensor_list_stack(inner_l, element_dtype=dtypes.float32) + self.assertAllEqual(t, [1.0, 2.0, 3.0]) + if __name__ == "__main__": test.main() From 27dc5f59a2faf3033a68aec5fa6ec17760617a56 Mon Sep 17 00:00:00 2001 From: Yilei Yang Date: Tue, 23 Jul 2019 17:26:09 -0700 Subject: [PATCH 0437/3053] Explicitly set python_version to PY2. PiperOrigin-RevId: 259645895 --- tensorflow/lite/build_def.bzl | 1 + tensorflow/python/tools/api/generator/api_gen.bzl | 1 + 2 files changed, 2 insertions(+) diff --git a/tensorflow/lite/build_def.bzl b/tensorflow/lite/build_def.bzl index 2311359308a..cb98f69ec47 100644 --- a/tensorflow/lite/build_def.bzl +++ b/tensorflow/lite/build_def.bzl @@ -498,6 +498,7 @@ def gen_model_coverage_test(src, model_name, data, failure_type, tags): ] + args, data = data, srcs_version = "PY2AND3", + python_version = "PY2", tags = [ "no_oss", "no_windows", diff --git a/tensorflow/python/tools/api/generator/api_gen.bzl b/tensorflow/python/tools/api/generator/api_gen.bzl index 5e64cc64d24..234addaf782 100644 --- a/tensorflow/python/tools/api/generator/api_gen.bzl +++ b/tensorflow/python/tools/api/generator/api_gen.bzl @@ -67,6 +67,7 @@ def gen_api_init_files( name = api_gen_binary_target, srcs = ["//tensorflow/python/tools/api/generator:create_python_api.py"], main = "//tensorflow/python/tools/api/generator:create_python_api.py", + python_version = "PY2", srcs_version = "PY2AND3", visibility = ["//visibility:public"], deps = package_deps + [ From e0813e20f610195ca19596bd28936cae64af321b Mon Sep 17 00:00:00 2001 From: Jonathan Hseu Date: Tue, 23 Jul 2019 17:27:11 -0700 Subject: [PATCH 0438/3053] Switch backend.variable() to create a normal TF variable. This changes its behavior under tf.distribute.Strategy, where it'll now create the appropriate distributed variable. 
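A small sketch of the behavior change described above, assuming a TensorFlow build from around this time with tf.distribute.MirroredStrategy available; it only illustrates that variable creation inside a strategy scope can now be intercepted by the strategy.

import tensorflow as tf
from tensorflow.python.keras import backend as K

strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
  # Previously K.variable built a ResourceVariable directly; routing through
  # tf.Variable lets the active strategy return its distributed variable type.
  v = K.variable(1.0, name='kernel_scale')
print(type(v).__name__)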
PiperOrigin-RevId: 259646025 --- tensorflow/python/keras/backend.py | 3 +-- ...as_stateful_lstm_model_correctness_test.py | 7 ++++--- .../keras/engine/training_distributed.py | 19 ++++++++++--------- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py index c7ebb4b2524..186b4f24639 100644 --- a/tensorflow/python/keras/backend.py +++ b/tensorflow/python/keras/backend.py @@ -64,7 +64,6 @@ from tensorflow.python.ops import map_fn as map_fn_lib from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn from tensorflow.python.ops import random_ops -from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import sparse_ops from tensorflow.python.ops import state_ops from tensorflow.python.ops import tensor_array_grad # pylint: disable=unused-import @@ -775,7 +774,7 @@ def variable(value, dtype=None, name=None, constraint=None): indices=indices, values=sparse_coo.data, dense_shape=sparse_coo.shape) v._keras_shape = sparse_coo.shape return v - v = resource_variable_ops.ResourceVariable( + v = variables_module.Variable( value, dtype=dtypes_module.as_dtype(dtype), name=name, diff --git a/tensorflow/python/keras/distribute/keras_stateful_lstm_model_correctness_test.py b/tensorflow/python/keras/distribute/keras_stateful_lstm_model_correctness_test.py index 3a6d5cc30a2..4802c8d07d7 100644 --- a/tensorflow/python/keras/distribute/keras_stateful_lstm_model_correctness_test.py +++ b/tensorflow/python/keras/distribute/keras_stateful_lstm_model_correctness_test.py @@ -82,10 +82,11 @@ class DistributionStrategyStatefulLstmModelCorrectnessTest( metrics=['sparse_categorical_accuracy']) return model + # TODO(jhseu): Disabled to fix b/130808953. Need to investigate why it + # doesn't work and enable for DistributionStrategy more generally. @combinations.generate(test_combinations_for_stateful_embedding_model()) - def test_stateful_lstm_model_correctness(self, distribution, use_numpy, - use_validation_data, - run_distributed): + def disabled_test_stateful_lstm_model_correctness( + self, distribution, use_numpy, use_validation_data, run_distributed): self.run_correctness_test( distribution, use_numpy, diff --git a/tensorflow/python/keras/engine/training_distributed.py b/tensorflow/python/keras/engine/training_distributed.py index fd2d8f04955..547a4f9cc26 100644 --- a/tensorflow/python/keras/engine/training_distributed.py +++ b/tensorflow/python/keras/engine/training_distributed.py @@ -163,8 +163,16 @@ def experimental_tpu_fit_loop(model, ValueError: in case of invalid arguments. """ mode = ModeKeys.TRAIN - # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops. + current_strategy = model._distribution_strategy + iteration_value = min(steps_per_epoch, + current_strategy.extended.steps_per_run) + steps_per_run = K.variable( + value=iteration_value, + dtype='int32', + name='steps_per_run') + + # TODO(fchollet): add support for `steps_per_epoch=None` in TPU loops. 
iterator = dist_utils.get_iterator(dataset, current_strategy) scope = dist_utils.distributed_scope( @@ -183,13 +191,6 @@ def experimental_tpu_fit_loop(model, tensor = m.result() initial_loop_values[m.name] = array_ops.zeros(tensor.shape, tensor.dtype) - iteration_value = min(steps_per_epoch, - current_strategy.extended.steps_per_run) - - steps_per_run = K.variable( - value=iteration_value, - dtype='int32', - name='steps_per_run') ctx = current_strategy.extended.experimental_run_steps_on_iterator( step_fn, iterator, iterations=steps_per_run, initial_loop_values=initial_loop_values) @@ -236,7 +237,7 @@ def experimental_tpu_fit_loop(model, batch_logs = {'batch': step_index, 'size': 1, 'num_steps': step_count} callbacks._call_batch_hook(mode, 'begin', step_index, batch_logs) if prev_step_count is None or step_count != prev_step_count: - steps_per_run.load(step_count, K.get_session()) + K.get_session().run(steps_per_run.assign(step_count)) prev_step_count = step_count try: _, outputs = K.batch_get_value([train_op, output_tensors]) From f0c35559f22425d66487bd6c1265c51c4edcc546 Mon Sep 17 00:00:00 2001 From: Ayush Dubey Date: Tue, 23 Jul 2019 17:30:54 -0700 Subject: [PATCH 0439/3053] Stop using deprecated `ncclBcast` and switch to new `ncclBroadcast`. `ncclBcast` was deprecated sometime ago. The new function, `ncclBroadcast` enables both in place and out of place broadcast. This change also adds tests that cover `NcclManager`'s use of `ncclBroadcast`. PiperOrigin-RevId: 259646520 --- tensorflow/core/nccl/nccl_manager.cc | 29 ++++++-- tensorflow/core/nccl/nccl_manager_test.cc | 81 +++++++++++++++++++++++ 2 files changed, 106 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/nccl/nccl_manager.cc b/tensorflow/core/nccl/nccl_manager.cc index 9f26cb2e6f7..20ba3caf9a5 100644 --- a/tensorflow/core/nccl/nccl_manager.cc +++ b/tensorflow/core/nccl/nccl_manager.cc @@ -608,10 +608,31 @@ void NcclManager::LoopKernelLaunches(NcclStream* nccl_stream) { break; } case kBroadcast: { - const Tensor* buf_t = p->input ? 
p->input : p->output; - void* buf = const_cast(buf_t->tensor_data().data()); - nccl_result = ncclBcast(buf, buf_t->NumElements(), data_type, - collective->root_rank, nccl_comm, *cu_stream); + const void* sendbuff = nullptr; + void* recvbuff = nullptr; + int num_elements = -1; + if (p->input) { + sendbuff = p->input->tensor_data().data(); + num_elements = p->input->NumElements(); + } + if (p->output) { + recvbuff = const_cast(p->output->tensor_data().data()); + num_elements = p->output->NumElements(); + } + if (num_elements < 0) { + p->done_callback(errors::Internal( + "Both input and output are null in ncclBroadcast")); + collective->Unref(); + continue; + } + VLOG(2) << "call NcclBroadcast collective_key " + << collective->collective_key << " participant " << p_idx + << " sendbuff " << sendbuff << " recvbuff " << recvbuff + << " nccl_comm " << nccl_comm << " comm_stream " << comm_stream + << " cuda_stream " << cu_stream; + nccl_result = + ncclBroadcast(sendbuff, recvbuff, num_elements, data_type, + collective->root_rank, nccl_comm, *cu_stream); break; } case kReduce: { diff --git a/tensorflow/core/nccl/nccl_manager_test.cc b/tensorflow/core/nccl/nccl_manager_test.cc index fcf67c2e8b5..161a88937c3 100644 --- a/tensorflow/core/nccl/nccl_manager_test.cc +++ b/tensorflow/core/nccl/nccl_manager_test.cc @@ -178,6 +178,47 @@ class NcclManagerTest : public ::testing::Test { return test_case; } + // Make a broadcast test which broadcasts a tensor with shape `shape` from + // `src_node`, `src_rank` to all other ranks. + // If `in_place` is true, input and output are the same for the source, + // otherwise they are tensors backed by different buffers. + TestCase* MakeBroadcastTestCase(int num_nodes, int num_ranks_per_node, + TensorShape shape, int src_node, int src_rank, + bool in_place) { + TestCase* test_case = new TestCase(); + test_case->expected = Tensor(data_type_, shape); + test::FillFn(&test_case->expected, + [](int) { return static_cast(1); }); + + for (int node = 0; node < num_nodes; ++node) { + for (int local_rank = 0; local_rank < num_ranks_per_node; ++local_rank) { + auto* device = GetDevice(local_rank); + if (node == src_node && local_rank == src_rank) { + test_case->ins.emplace_back(GpuAllocator(device), data_type_, shape); + if (in_place) { + test_case->outs.emplace_back(test_case->ins.back()); + } else { + test_case->outs.emplace_back(GpuAllocator(device), data_type_, + shape); + } + Tensor in_cpu(data_type_, shape); + test::FillFn(&in_cpu, + [](int) { return static_cast(1); }); + const Tensor& in_gpu = test_case->ins.back(); + auto in_gpu_mem = AsDeviceMemory(in_gpu.flat().data()); + auto* stream = device->tensorflow_gpu_device_info()->stream; + stream->ThenMemcpy(&in_gpu_mem, in_cpu.flat().data(), + in_cpu.TotalBytes()); + } else { + test_case->ins.emplace_back(Tensor()); + test_case->outs.emplace_back(GpuAllocator(device), data_type_, shape); + } + } + } + + return test_case; + } + // Waits for the done callback to be called for each participant. void WaitForTestCompletion(TestCase* test_case) { test_case->mu.lock(); @@ -451,6 +492,46 @@ TYPED_TEST(NcclManagerTest, BasicAllGather) { } } +// Test basic broadcast. 
+TYPED_TEST(NcclManagerTest, BasicBroadcast) { + const int num_ranks = 4; + const int src_rank = 2; + for (int in_place_idx = 0; in_place_idx <= 1; ++in_place_idx) { + bool in_place = in_place_idx == 1; + std::unique_ptr test_case( + this->MakeBroadcastTestCase(/*num_nodes=*/1, num_ranks, + TensorShape({5, 6}), /*src_node=*/0, + src_rank, in_place)); + for (int rank = 0; rank < num_ranks; ++rank) { + auto* device = this->GetDevice(rank); + auto* event_mgr = device->tensorflow_gpu_device_info()->event_mgr; + auto* stream = device->tensorflow_gpu_device_info()->stream; + auto* input = rank == src_rank ? &test_case->ins[rank] : nullptr; + auto* output = test_case->outs[rank].NumElements() == 0 + ? nullptr + : &test_case->outs[rank]; + auto participant = absl::make_unique( + device->executor(), stream, event_mgr, device->gpu_id(), input, + output, rank, this->CreateDoneCallback(test_case.get())); + if (rank == src_rank) { + NcclManager::instance()->AddBroadcastSend( + std::move(participant), + {"broadcast", /*num_local_devices=*/num_ranks, + /*num_global_devices=*/num_ranks, + /*communicator_key=*/""}); + } else { + NcclManager::instance()->AddBroadcastRecv( + std::move(participant), + {"broadcast", /*num_local_devices=*/num_ranks, + /*num_global_devices=*/num_ranks, + /*communicator_key=*/""}); + } + } + + this->VerifyResults(test_case.get()); + } +} + // Multi-node NCCL tests. TEST(NcclManagerTest, CommunicatorKey) { From a09331a0e01c5018305bb6f1637a093ec338536d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 23 Jul 2019 17:31:24 -0700 Subject: [PATCH 0440/3053] Limit the size of the unrolled offsets array in CONVOLUTION_2D and in DEPTHWISE_CONVOLUTION. This CL resolves some pathological performance regressions observed for large convolution kernels, probably due to register file overcommit. Performance in regular case over a standard suite of CNN models remains the same or better. After extensive testing the optimum performance seems to be with the limit set to 9 (3x3). 
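A hedged sketch of the decision the shader generator makes after this change; the constant mirrors kMaxConstArraySize from the patch, and everything else (names, tuple layout) is illustrative rather than the real GLSL code generator.

MAX_CONST_ARRAY_SIZE = 9  # mirrors kMaxConstArraySize in the patch below

def build_const_offsets(kernel_h, kernel_w, dilation, padding):
  """Returns the unrolled offsets array, or None to emit a runtime loop."""
  if kernel_h * kernel_w > MAX_CONST_ARRAY_SIZE:
    return None  # large kernels: loop over (ky, kx) at run time instead
  return [(kx * dilation[0] - padding[0], ky * dilation[1] - padding[1])
          for ky in range(kernel_h) for kx in range(kernel_w)]

assert build_const_offsets(3, 3, (1, 1), (1, 1)) is not None  # 9 taps: unroll
assert build_const_offsets(7, 7, (1, 1), (3, 3)) is None      # 49 taps: loop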
PiperOrigin-RevId: 259646598 --- .../lite/delegates/gpu/gl/kernels/conv.cc | 88 ++++++++++++------- .../gpu/gl/kernels/depthwise_conv.cc | 77 +++++++++++----- .../lite/delegates/gpu/gl/node_shader.h | 3 + 3 files changed, 114 insertions(+), 54 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/conv.cc b/tensorflow/lite/delegates/gpu/gl/kernels/conv.cc index 9a1c665f763..1025bc9a61f 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/conv.cc +++ b/tensorflow/lite/delegates/gpu/gl/kernels/conv.cc @@ -44,21 +44,38 @@ class Convolution : public NodeShader { ctx.node->operation.attributes); auto weights = attr.weights.shape; const int offsets_count = weights.h * weights.w; - std::vector offsets; - for (int h = 0; h < weights.h; ++h) { - for (int w = 0; w < weights.w; ++w) { - offsets.emplace_back(w * attr.dilations.w - attr.padding.prepended.w, - h * attr.dilations.h - attr.padding.prepended.h); + const bool offsets_count_too_large = offsets_count > kMaxConstArraySize; + std::vector parameters; + if (offsets_count_too_large) { + parameters = { + {"input_data_0_h", input->tensor.shape.h}, + {"input_data_0_w", input->tensor.shape.w}, + {"padding_w", attr.padding.prepended.w}, + {"padding_h", attr.padding.prepended.h}, + {"dilation_w", attr.dilations.w}, + {"dilation_h", attr.dilations.h}, + {"kernel_w", weights.w}, + {"kernel_h", weights.h}, + {"src_depth", IntegralDivideRoundUp(weights.i, 4)}, + {"stride", int2(attr.strides.w, attr.strides.h)}, + }; + } else { + std::vector offsets; + for (int h = 0; h < weights.h; ++h) { + for (int w = 0; w < weights.w; ++w) { + offsets.emplace_back(w * attr.dilations.w - attr.padding.prepended.w, + h * attr.dilations.h - attr.padding.prepended.h); + } } + parameters = { + {"input_data_0_h", input->tensor.shape.h}, + {"input_data_0_w", input->tensor.shape.w}, + {"offsets_count", offsets_count}, + {"offsets", offsets}, + {"src_depth", IntegralDivideRoundUp(weights.i, 4)}, + {"stride", int2(attr.strides.w, attr.strides.h)}, + }; } - std::vector parameters = { - {"input_data_0_h", input->tensor.shape.h}, - {"input_data_0_w", input->tensor.shape.w}, - {"offsets_count", offsets_count}, - {"offsets", offsets}, - {"src_depth", IntegralDivideRoundUp(weights.i, 4)}, - {"stride", int2(attr.strides.w, attr.strides.h)}, - }; // at least one padding is not empty bool non_empty_padding = @@ -69,9 +86,18 @@ class Convolution : public NodeShader { {"weights", MakeReadonlyObject(Get3DSizeForPHWO4I4(attr.weights.shape), ConvertToPHWO4I4(attr.weights))}}; - std::string source = R"( - for (int i = 0; i < $offsets_count$; ++i) { - ivec2 coord = gid.xy * $stride$ + $offsets[i]$;)"; + std::string source; + if (offsets_count_too_large) { + source = R"( + int i = 0; + for (int ky = 0; ky < $kernel_h$; ky++) { + for (int kx = 0; kx < $kernel_w$; kx++, i++) { + ivec2 coord = gid.xy * $stride$ + ivec2(kx * $dilation_w$ - $padding_w$, ky * $dilation_h$ - $padding_h$);)"; + } else { + source = R"( + for (int i = 0; i < $offsets_count$; ++i) { + ivec2 coord = gid.xy * $stride$ + $offsets[i]$;)"; + } if (non_empty_padding) { source += R"( if (coord.x < 0 || coord.y < 0 || coord.x >= $input_data_0_w$ || coord.y >= $input_data_0_h$) { @@ -79,29 +105,25 @@ class Convolution : public NodeShader { })"; } source += R"( - for (int l = 0; l < $src_depth$; ++l) { - highp vec4 input_ = $input_data_0[coord.x, coord.y, l]$; - value_0.x += dot(input_, $weights[l * 4 + 0, i, gid.z]$); - value_0.y += dot(input_, $weights[l * 4 + 1, i, gid.z]$); - value_0.z += dot(input_, $weights[l * 4 + 
2, i, gid.z]$); - value_0.w += dot(input_, $weights[l * 4 + 3, i, gid.z]$); + for (int l = 0; l < $src_depth$; ++l) { + highp vec4 input_ = $input_data_0[coord.x, coord.y, l]$; + value_0.x += dot(input_, $weights[l * 4 + 0, i, gid.z]$); + value_0.y += dot(input_, $weights[l * 4 + 1, i, gid.z]$); + value_0.z += dot(input_, $weights[l * 4 + 2, i, gid.z]$); + value_0.w += dot(input_, $weights[l * 4 + 3, i, gid.z]$); + } } +)"; + if (offsets_count_too_large) { + source += R"( } - )"; +)"; + } if (!attr.bias.data.empty()) { source += "value_0 += $bias[gid.z]$;\n"; objects.push_back({"bias", MakeReadonlyObject(attr.bias.data)}); } - // This is a hotfix for special convolution, which worked 10ms on - // textures16. With this fix it works 4ms. - // TODO(eignasheva): fix this problem in the proper way - uint3 workgroup = uint3(0, 0, 0); - if (weights.h == 7 && weights.w == 7 && attr.strides.h == 4 && - attr.strides.w == 4) { - workgroup = uint3(8, 8, 8); - } - *generated_code = { /*parameters=*/std::move(parameters), /*objects=*/std::move(objects), @@ -110,7 +132,7 @@ class Convolution : public NodeShader { /*workgroup=*/ GetIdealWorkgroupIfPossible( ctx.gpu_info->gpu_model, OperationType::CONVOLUTION_2D, - HW(weights.h, weights.w), attr.strides, workgroup, + HW(weights.h, weights.w), attr.strides, uint3(0, 0, 0), OHWI(weights.o, input->tensor.shape.h, input->tensor.shape.w, input->tensor.shape.c)), /*source_code=*/std::move(source), diff --git a/tensorflow/lite/delegates/gpu/gl/kernels/depthwise_conv.cc b/tensorflow/lite/delegates/gpu/gl/kernels/depthwise_conv.cc index cc85211d178..4b0d279ad4f 100644 --- a/tensorflow/lite/delegates/gpu/gl/kernels/depthwise_conv.cc +++ b/tensorflow/lite/delegates/gpu/gl/kernels/depthwise_conv.cc @@ -43,23 +43,40 @@ class DepthwiseConvolution : public NodeShader { ctx.node->operation.attributes); auto weights = attr.weights.shape; const int offsets_count = weights.h * weights.w; - std::vector offsets; - for (int h = 0; h < weights.h; ++h) { - for (int w = 0; w < weights.w; ++w) { - offsets.emplace_back(w * attr.dilations.w - attr.padding.prepended.w, - h * attr.dilations.h - attr.padding.prepended.h); + const bool offsets_count_too_large = offsets_count > kMaxConstArraySize; + std::vector parameters; + if (offsets_count_too_large) { + parameters = { + {"input_data_0_h", input->tensor.shape.h}, + {"input_data_0_w", input->tensor.shape.w}, + {"padding_w", attr.padding.prepended.w}, + {"padding_h", attr.padding.prepended.h}, + {"dilation_w", attr.dilations.w}, + {"dilation_h", attr.dilations.h}, + {"kernel_w", weights.w}, + {"kernel_h", weights.h}, + {"src_depth", IntegralDivideRoundUp(weights.i, 4)}, + {"channel_multiplier", weights.o}, + {"stride", int2(attr.strides.w, attr.strides.h)}, + }; + } else { + std::vector offsets; + for (int h = 0; h < weights.h; ++h) { + for (int w = 0; w < weights.w; ++w) { + offsets.emplace_back(w * attr.dilations.w - attr.padding.prepended.w, + h * attr.dilations.h - attr.padding.prepended.h); + } } + parameters = { + {"input_data_0_h", input->tensor.shape.h}, + {"input_data_0_w", input->tensor.shape.w}, + {"offsets_count", offsets_count}, + {"offsets", offsets}, + {"src_depth", IntegralDivideRoundUp(weights.i, 4)}, + {"channel_multiplier", weights.o}, + {"stride", int2(attr.strides.w, attr.strides.h)}, + }; } - std::vector parameters = { - {"input_data_0_h", input->tensor.shape.h}, - {"input_data_0_w", input->tensor.shape.w}, - {"offsets_count", offsets_count}, - {"offsets", offsets}, - {"src_depth", IntegralDivideRoundUp(weights.i, 
4)}, - {"channel_multiplier", weights.o}, - {"stride", int2(attr.strides.w, attr.strides.h)}, - }; - bool non_empty_padding = attr.padding.appended.h != 0 || attr.padding.appended.w != 0 || attr.padding.prepended.h != 0 || attr.padding.prepended.w != 0; @@ -67,11 +84,24 @@ class DepthwiseConvolution : public NodeShader { std::vector> objects = { {"weights", MakeReadonlyObject(ConvertToPIOHW4(attr.weights))}}; - std::string source = R"( - int src_layer_offset = (gid.z % $channel_multiplier$) * 4; - int filter_offset = gid.z * $src_depth$ * $offsets_count$ * 4; - for (int i = 0; i < $offsets_count$; ++i) { - ivec2 coord = gid.xy * $stride$ + $offsets[i]$;)"; + std::string source; + if (offsets_count_too_large) { + source = R"( + int offsets_count = $kernel_w$ * $kernel_h$; + int src_layer_offset = (gid.z % $channel_multiplier$) * 4; + int filter_offset = gid.z * $src_depth$ * offsets_count * 4; + int i = 0; + for (int ky = 0; ky < $kernel_h$; ky++) { + for (int kx = 0; kx < $kernel_w$; kx++, i++) { + ivec2 coord = gid.xy * $stride$ + ivec2(kx * $dilation_w$ - $padding_w$, ky * $dilation_h$ - $padding_h$);)"; + } else { + source = R"( + int offsets_count = $offsets_count$; + int src_layer_offset = (gid.z % $channel_multiplier$) * 4; + int filter_offset = gid.z * $src_depth$ * offsets_count * 4; + for (int i = 0; i < offsets_count; ++i) { + ivec2 coord = gid.xy * $stride$ + $offsets[i]$;)"; + } if (non_empty_padding) { source += R"( if (coord.x < 0 || coord.y < 0 || @@ -87,10 +117,15 @@ class DepthwiseConvolution : public NodeShader { input_shifted[1] = input_[(src_layer_offset + 1) / $channel_multiplier$]; input_shifted[2] = input_[(src_layer_offset + 2) / $channel_multiplier$]; input_shifted[3] = input_[(src_layer_offset + 3) / $channel_multiplier$]; - int filter_offset = gid.z * $offsets_count$ + i; + int filter_offset = gid.z * offsets_count + i; value_0 += input_shifted * $weights[filter_offset]$; } )"; + if (offsets_count_too_large) { + source += R"( + } +)"; + } if (!attr.bias.data.empty()) { source += "value_0 += $bias[gid.z]$;\n"; objects.push_back({"bias", MakeReadonlyObject(attr.bias.data)}); diff --git a/tensorflow/lite/delegates/gpu/gl/node_shader.h b/tensorflow/lite/delegates/gpu/gl/node_shader.h index 310719e23c9..0225a7cee73 100644 --- a/tensorflow/lite/delegates/gpu/gl/node_shader.h +++ b/tensorflow/lite/delegates/gpu/gl/node_shader.h @@ -103,6 +103,9 @@ class NodeShader { // Generates shader code for a node. The code should be just a function body. virtual Status GenerateCode(const GenerationContext& ctx, GeneratedCode* generated_code) const = 0; + + // Limit the size of the const offsets array + static constexpr int kMaxConstArraySize = 9; }; } // namespace gl From 71dbe5059a1da38c0fe483fd94d4fb014e068f07 Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Tue, 23 Jul 2019 18:06:12 -0700 Subject: [PATCH 0441/3053] Key dictionaries of Tensor id instead of hash PiperOrigin-RevId: 259651701 --- .../python/framework/auto_control_deps.py | 40 +++++++++++-------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/tensorflow/python/framework/auto_control_deps.py b/tensorflow/python/framework/auto_control_deps.py index 1c16d38cbda..1d2757bdacf 100644 --- a/tensorflow/python/framework/auto_control_deps.py +++ b/tensorflow/python/framework/auto_control_deps.py @@ -209,10 +209,13 @@ class AutomaticControlDependencies(object): all usages of it. 
""" inp = switch_op.inputs[0] + input_id = ops.tensor_id(inp) if inp.dtype == dtypes_module.resource and inp.op.type == "Switch": self._process_switch(inp.op, ops_which_must_run, last_op_using_resource_tensor, merge_for_resource) - if switch_op.outputs[0] in merge_for_resource: + output = switch_op.outputs[0] + output_id = ops.tensor_id(output) + if output_id in merge_for_resource: return new_merge = control_flow_ops.merge(switch_op.outputs, name="artificial_merge") @@ -220,16 +223,16 @@ class AutomaticControlDependencies(object): switch_op._control_flow_context.outer_context) # pylint: disable=protected-access # Ensures the merge always runs ops_which_must_run.add(new_merge[0].op) - if inp in last_op_using_resource_tensor: + if input_id in last_op_using_resource_tensor: # Ensures the switch executes after the previous op using the resource. - switch_op._add_control_input(last_op_using_resource_tensor[inp]) # pylint: disable=protected-access + switch_op._add_control_input(last_op_using_resource_tensor[input_id]) # pylint: disable=protected-access # Ensure the next op outside the cond happens after the merge. - last_op_using_resource_tensor[inp] = new_merge[0].op - if inp in merge_for_resource: - merge_for_resource[inp]._add_control_input(new_merge[0].op) # pylint: disable=protected-access + last_op_using_resource_tensor[input_id] = new_merge[0].op + if input_id in merge_for_resource: + merge_for_resource[input_id]._add_control_input(new_merge[0].op) # pylint: disable=protected-access for o in switch_op.outputs: # Ensures the merge will execute after all ops inside the cond - merge_for_resource[o] = new_merge[0].op + merge_for_resource[ops.tensor_id(o)] = new_merge[0].op def __exit__(self, unused_type, unused_value, unused_traceback): if context.executing_eagerly(): @@ -301,8 +304,9 @@ class AutomaticControlDependencies(object): for o in ops_which_must_run: op._add_control_input(o) # pylint: disable=protected-access for inp in o.inputs: - if inp in last_op_using_resource_tensor: - last_op_using_resource_tensor[inp] = op + input_id = ops.tensor_id(inp) + if input_id in last_op_using_resource_tensor: + last_op_using_resource_tensor[input_id] = op ops_which_must_run = set([op]) continue @@ -313,26 +317,28 @@ class AutomaticControlDependencies(object): if inp.dtype != dtypes_module.resource: continue + input_id = ops.tensor_id(inp) + # If the op receives the same resource tensor twice as an input, we skip # to avoid the op getting a control dependency on itself. - if id(inp) in resource_inputs: + if input_id in resource_inputs: continue - resource_inputs.add(id(inp)) + resource_inputs.add(input_id) # Deal with switches, finally. 
if inp.op.type == "Switch": self._process_switch(inp.op, ops_which_must_run, last_op_using_resource_tensor, merge_for_resource) # Ensure uses of resources are serialized - if inp in last_op_using_resource_tensor: - if (last_op_using_resource_tensor[inp]._control_flow_context # pylint: disable=protected-access + if input_id in last_op_using_resource_tensor: + if (last_op_using_resource_tensor[input_id]._control_flow_context # pylint: disable=protected-access is op._control_flow_context): # pylint: disable=protected-access - control_inputs.add(last_op_using_resource_tensor[inp]) + control_inputs.add(last_op_using_resource_tensor[input_id]) # Ensure merges happen after the closing of a cond block - if inp in merge_for_resource: - merge_for_resource[inp]._add_control_input(op) # pylint: disable=protected-access - last_op_using_resource_tensor[inp] = op + if input_id in merge_for_resource: + merge_for_resource[input_id]._add_control_input(op) # pylint: disable=protected-access + last_op_using_resource_tensor[input_id] = op if (op_is_stateful(op) and not resource_inputs and op._control_flow_context is None): # pylint: disable=protected-access From d6ca609b218fa87bb9f8e32b5b88d48720be47cf Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Tue, 23 Jul 2019 18:15:54 -0700 Subject: [PATCH 0442/3053] Add more tests to show tensor equality changes PiperOrigin-RevId: 259653024 --- tensorflow/python/eager/core_test.py | 241 ++++++++++++++++++--------- 1 file changed, 160 insertions(+), 81 deletions(-) diff --git a/tensorflow/python/eager/core_test.py b/tensorflow/python/eager/core_test.py index 7958f7ee15e..f2e77fe4a90 100644 --- a/tensorflow/python/eager/core_test.py +++ b/tensorflow/python/eager/core_test.py @@ -92,101 +92,180 @@ class TFETest(test_util.TensorFlowTestCase): def testEquality(self): default = ops.Tensor._USE_EQUALITY - def _v1_check(a, b): - self.assertEqual(a, a) - self.assertIs(a, a) - self.assertNotEqual(a, 1.0) - self.assertIsNot(a, 1.0) - self.assertNotEqual(a, b) - self.assertIsNot(a, b) + try: + def _v1_check(a, b): + self.assertEqual(a, a) + self.assertIs(a, a) + self.assertNotEqual(a, 1.0) + self.assertIsNot(a, 1.0) + self.assertNotEqual(a, b) + self.assertIsNot(a, b) - def _v2_check(a, b): - self.assertEqual(a, a) - self.assertIs(a, a) - self.assertEqual(a, 1.0) - self.assertIsNot(a, 1.0) - self.assertEqual(a, b) - self.assertIsNot(a, b) + def _v2_check(a, b): + self.assertEqual(a, a) + self.assertIs(a, a) + self.assertEqual(a, 1.0) + self.assertIsNot(a, 1.0) + self.assertEqual(a, b) + self.assertIsNot(a, b) - constant_a = constant_op.constant(1.0) - constant_b = constant_op.constant(1.0) + constant_a = constant_op.constant(1.0) + constant_b = constant_op.constant(1.0) - ops.disable_tensor_equality() - self._test_hashable(constant_a, constant_b, True) - _v1_check(constant_a, constant_b) - ops.enable_tensor_equality() - _v2_check(constant_a, constant_b) - self._test_hashable(constant_a, constant_b, False) - - variable_a = variables.Variable(1.0) - variable_b = variables.Variable(1.0) - - ops.disable_tensor_equality() - _v1_check(variable_a, variable_b) - self._test_hashable(variable_a, variable_b, True) - ops.enable_tensor_equality() - _v2_check(variable_a, variable_b) - self._test_hashable(variable_a, variable_b, True) - - if default: - ops.enable_tensor_equality() - else: ops.disable_tensor_equality() + self._test_hashable(constant_a, constant_b, True) + _v1_check(constant_a, constant_b) + ops.enable_tensor_equality() + _v2_check(constant_a, constant_b) + 
self._test_hashable(constant_a, constant_b, False) - # We only test numpy behaviour in v2 mode since we'd like to match that. - numpy_a = np.array(1.0) - numpy_b = np.array(1.0) - _v2_check(numpy_a, numpy_b) - self._test_hashable(numpy_a, numpy_b, False) + variable_a = variables.Variable(1.0) + variable_b = variables.Variable(1.0) + + ops.disable_tensor_equality() + _v1_check(variable_a, variable_b) + self._test_hashable(variable_a, variable_b, True) + ops.enable_tensor_equality() + _v2_check(variable_a, variable_b) + self._test_hashable(variable_a, variable_b, True) + + # We only test numpy behaviour in v2 mode since we'd like to match that. + numpy_a = np.array(1.0) + numpy_b = np.array(1.0) + _v2_check(numpy_a, numpy_b) + self._test_hashable(numpy_a, numpy_b, False) + finally: + if default: + ops.enable_tensor_equality() + else: + ops.disable_tensor_equality() def testEqualityNan(self): default = ops.Tensor._USE_EQUALITY - def _v1_check(a, b): - self.assertEqual(a, a) - self.assertIs(a, a) - self.assertNotEqual(a, float('nan')) - self.assertIsNot(a, float('nan')) - self.assertNotEqual(a, b) - self.assertIsNot(a, b) + try: + def _v1_check(a, b): + self.assertEqual(a, a) + self.assertIs(a, a) + self.assertNotEqual(a, float('nan')) + self.assertIsNot(a, float('nan')) + self.assertNotEqual(a, b) + self.assertIsNot(a, b) - def _v2_check(a, b): - self.assertNotEqual(a, a) - self.assertIs(a, a) - self.assertNotEqual(a, float('nan')) - self.assertIsNot(a, float('nan')) - self.assertNotEqual(a, b) - self.assertIsNot(a, b) + def _v2_check(a, b): + self.assertNotEqual(a, a) + self.assertIs(a, a) + self.assertNotEqual(a, float('nan')) + self.assertIsNot(a, float('nan')) + self.assertNotEqual(a, b) + self.assertIsNot(a, b) - constant_a = constant_op.constant(float('nan')) - constant_b = constant_op.constant(float('nan')) + constant_a = constant_op.constant(float('nan')) + constant_b = constant_op.constant(float('nan')) - ops.disable_tensor_equality() - self._test_hashable(constant_a, constant_b, True) - _v1_check(constant_a, constant_b) - ops.enable_tensor_equality() - _v2_check(constant_a, constant_b) - self._test_hashable(constant_a, constant_b, False) - - variable_a = variables.Variable(float('nan')) - variable_b = variables.Variable(float('nan')) - - ops.disable_tensor_equality() - _v1_check(variable_a, variable_b) - self._test_hashable(variable_a, variable_b, True) - ops.enable_tensor_equality() - _v2_check(variable_a, variable_b) - self._test_hashable(variable_a, variable_b, True) - - if default: - ops.enable_tensor_equality() - else: ops.disable_tensor_equality() + self._test_hashable(constant_a, constant_b, True) + _v1_check(constant_a, constant_b) + ops.enable_tensor_equality() + _v2_check(constant_a, constant_b) + self._test_hashable(constant_a, constant_b, False) - numpy_a = np.array(float('nan')) - numpy_b = np.array(float('nan')) - _v2_check(numpy_a, numpy_b) - self._test_hashable(numpy_a, numpy_b, False) + variable_a = variables.Variable(float('nan')) + variable_b = variables.Variable(float('nan')) + + ops.disable_tensor_equality() + _v1_check(variable_a, variable_b) + self._test_hashable(variable_a, variable_b, True) + ops.enable_tensor_equality() + _v2_check(variable_a, variable_b) + self._test_hashable(variable_a, variable_b, True) + + numpy_a = np.array(float('nan')) + numpy_b = np.array(float('nan')) + _v2_check(numpy_a, numpy_b) + self._test_hashable(numpy_a, numpy_b, False) + finally: + if default: + ops.enable_tensor_equality() + else: + ops.disable_tensor_equality() + + def 
testEqualityCompare(self): + default = ops.Tensor._USE_EQUALITY + + try: + tf_a = constant_op.constant([1, 2]) + tf_b = constant_op.constant([1, 2]) + tf_c = constant_op.constant([1, 1]) + np_a = np.array([1, 2]) + np_b = np.array([1, 2]) + np_c = np.array([1, 1]) + + ops.disable_tensor_equality() + # We don't do element-wise comparison + self.assertNotEqual(tf_a, tf_b) + self.assertNotEqual(tf_a, tf_c) + + # We can compare list of tensors + self.assertEqual([tf_a, tf_b], [tf_a, tf_b]) + self.assertNotEqual([tf_a, tf_b], [tf_b, tf_b]) + + # We can compare existence in a list + self.assertIn(tf_a, [tf_a, tf_b]) + self.assertIn(tf_a, [tf_b, tf_a]) + self.assertNotIn(tf_a, [tf_b, tf_c]) + + ops.enable_tensor_equality() + # We do element-wise comparison but can't convert results array to bool + with self.assertRaises(ValueError): + bool(tf_a == tf_b) + self.assertAllEqual(tf_a == tf_b, [True, True]) + with self.assertRaises(ValueError): + bool(tf_a == tf_c) + self.assertAllEqual(tf_a == tf_c, [True, False]) + with self.assertRaises(ValueError): + bool(np_a == np_b) + self.assertAllEqual(np_a == np_b, [True, True]) + with self.assertRaises(ValueError): + bool(np_a == np_c) + self.assertAllEqual(np_a == np_c, [True, False]) + + # Warning even though we technically shouldn't be able to compare here, + # since the id is the same both TF & numpy will handle lists with the same + # value without raising an error + self.assertEqual([tf_a, tf_b], [tf_a, tf_b]) + with self.assertRaises(ValueError): + bool([tf_a, tf_b] == [tf_b, tf_b]) + self.assertEqual([np_a, np_b], [np_a, np_b]) + with self.assertRaises(ValueError): + bool([np_a, np_b] == [np_b, np_b]) + + # Similar to lists we shouldn't be able to do a `in` check such as + # `if a in [a,b]`. However if `a` is the first element, it works due to + # short circuiting + self.assertIn(tf_a, [tf_a, tf_b]) + with self.assertRaises(ValueError): + bool(tf_a in [tf_b, tf_a]) + with self.assertRaises(ValueError): + bool(tf_a in [tf_b, tf_c]) + self.assertIn(np_a, [np_a, np_b]) + with self.assertRaises(ValueError): + bool(np_a in [np_b, np_a]) + with self.assertRaises(ValueError): + bool(np_a in [np_b, np_c]) + + # rank 0 + self.assertAllEqual( + constant_op.constant(1) == constant_op.constant(1), True) + self.assertAllEqual( + constant_op.constant(1) == constant_op.constant(2), False) + self.assertAllEqual(np.array(1) == np.array(1), True) + self.assertAllEqual(np.array(1) == np.array(2), False) + finally: + if default: + ops.enable_tensor_equality() + else: + ops.disable_tensor_equality() def testContext(self): ctx = context.Context() From 571328a56540c26926a80fb5adedf97f6f3bf6ce Mon Sep 17 00:00:00 2001 From: Yuanzhong Xu Date: Tue, 23 Jul 2019 18:21:02 -0700 Subject: [PATCH 0443/3053] XLA compiler: allow non-MAXIMAL arg/retval sharding annotation. 
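The diff below replaces the per-argument core index with the full sharding annotation. An illustrative-only contrast of the two bookkeeping schemes, using plain Python values instead of the real xla::OpSharding messages:

# Before: only MAXIMAL shardings survived, collapsed to a single core index.
arg_cores = {0: 0, 1: 1}

# After: the whole annotation is kept, so tuple/tiled/replicated shardings can
# also be applied when building arguments and return values.
arg_shardings = {
    0: {'type': 'MAXIMAL', 'tile_assignment_devices': [0]},
    1: {'type': 'OTHER',                      # e.g. a tiled sharding
        'tile_assignment_dimensions': [2, 1],
        'tile_assignment_devices': [0, 1]},
}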
PiperOrigin-RevId: 259653638 --- tensorflow/compiler/tf2xla/sharding_util.cc | 2 +- tensorflow/compiler/tf2xla/xla_compiler.cc | 97 ++++++++++----------- tensorflow/compiler/tf2xla/xla_compiler.h | 2 +- 3 files changed, 48 insertions(+), 53 deletions(-) diff --git a/tensorflow/compiler/tf2xla/sharding_util.cc b/tensorflow/compiler/tf2xla/sharding_util.cc index 8aae498be10..4d5bf0835e1 100644 --- a/tensorflow/compiler/tf2xla/sharding_util.cc +++ b/tensorflow/compiler/tf2xla/sharding_util.cc @@ -53,7 +53,7 @@ xla::StatusOr> ParseShardingFromDevice( const string& device_name, int num_cores_per_replica, absl::optional explicit_sharding) { if (device_name.empty()) { - return absl::optional(); + return explicit_sharding; } DeviceNameUtils::ParsedName parsed_device; if (!DeviceNameUtils::ParseFullName(device_name, &parsed_device)) { diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index 2ee8c7e5cfb..3959f130c20 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -76,41 +76,38 @@ Status CheckSignature(const DataTypeVector& types, return Status::OK(); } -// Uses the _Arg and _Retval nodes in the graph to determine a core assignment -// for each argument and return value. -xla::StatusOr, std::map>> -ComputeArgAndRetvalCores(const Graph& graph) { - auto get_sharding_for_node = [](const Node* n) -> xla::StatusOr { +// Uses the _Arg and _Retval nodes in the graph to determine an OpSharding for +// each argument and return value. +xla::StatusOr< + std::pair, std::map>> +ComputeArgAndRetvalShardings(const Graph& graph) { + auto get_sharding_for_node = + [](const Node* n) -> xla::StatusOr> { TF_ASSIGN_OR_RETURN( auto sharding, ParseShardingFromDevice(*n, std::numeric_limits::max())); - if (sharding.has_value()) { - TF_RET_CHECK(sharding.value().type() == xla::OpSharding::MAXIMAL); - return sharding.value().tile_assignment_devices(0); - } else { - return -1; - } + return sharding; }; - std::map arg_cores; - std::map retval_cores; + std::map arg_shardings; + std::map retval_shardings; for (const Node* n : graph.nodes()) { if (n->IsArg()) { - TF_ASSIGN_OR_RETURN(int core, get_sharding_for_node(n)); - if (core < 0) continue; + TF_ASSIGN_OR_RETURN(auto sharding, get_sharding_for_node(n)); + if (!sharding.has_value()) continue; int index; TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index)); TF_RET_CHECK(index >= 0) << "Negative _Arg index"; - arg_cores[index] = core; + arg_shardings[index] = std::move(*sharding); } else if (n->IsRetval()) { - TF_ASSIGN_OR_RETURN(int core, get_sharding_for_node(n)); - if (core < 0) continue; + TF_ASSIGN_OR_RETURN(auto sharding, get_sharding_for_node(n)); + if (!sharding.has_value()) continue; int index; TF_RETURN_IF_ERROR(GetNodeAttr(n->attrs(), "index", &index)); TF_RET_CHECK(index >= 0) << "Negative _Retval index"; - retval_cores[index] = core; + retval_shardings[index] = std::move(*sharding); } } - return std::make_pair(std::move(arg_cores), std::move(retval_cores)); + return std::make_pair(std::move(arg_shardings), std::move(retval_shardings)); } Status ExecuteGraph(XlaContext* xla_context, std::unique_ptr graph, @@ -144,8 +141,8 @@ Status ExecuteGraph(XlaContext* xla_context, std::unique_ptr graph, // - `args` is the list of input arguments // - `retvals` is the list of retvals produced by _Retval operators, in index // order. -// - `args_core` and `retval_cores` are mapping from arg/return indices to core -// assignments. 
+// - `arg_shardings` and `retval_shardings` are mapping from arg/return indices +// to sharding. // - If `return_updated_values_for_all_resources` is true, all resources will be // included in `resource_updates`, regardless of whether their value changed. // - Sets `*num_nonconst_outputs` to the number of outputs of the `computation`. @@ -158,7 +155,8 @@ Status ExecuteGraph(XlaContext* xla_context, std::unique_ptr graph, Status BuildComputation( const std::vector& args, const std::vector& retvals, - const std::map& arg_cores, const std::map& retval_cores, + const std::map& arg_shardings, + const std::map& retval_shardings, const std::vector>& resources, std::unique_ptr token_output, const XlaCompiler::ShapeRepresentationFn& shape_representation_fn, @@ -212,11 +210,11 @@ Status BuildComputation( output.is_constant = false; TF_ASSIGN_OR_RETURN(output.shape, retval.GetShape()); xla::XlaOp value = retval.handle(); - auto it = retval_cores.find(i); + auto it = retval_shardings.find(i); xla::XlaScopedShardingAssignment assign_sharding( - builder, it == retval_cores.end() + builder, it == retval_shardings.end() ? absl::optional() - : xla::sharding_builder::AssignDevice(it->second)); + : it->second); if (shape_representation_fn) { // If there is a shape representation function, reshape the output // tensor to the shape given by the representation shape function. @@ -224,7 +222,7 @@ Status BuildComputation( output.shape, output.type)); value = xla::Reshape(value, xla::AsInt64Slice(shape.dimensions())); retval_index_and_layout.emplace_back(elems.size(), shape.layout()); - } else if (it != retval_cores.end()) { + } else if (it != retval_shardings.end()) { // Apply the sharding to the output, if there is a core assignment. value = identity_op(value); } @@ -265,8 +263,7 @@ Status BuildComputation( for (const XlaResource* resource : arg_resources) { DCHECK_LT(resource->arg_num(), args.size()); const XlaCompiler::Argument& arg = args[resource->arg_num()]; - auto it = arg_cores.find(resource->arg_num()); - const int core = it == arg_cores.end() ? -1 : it->second; + auto it = arg_shardings.find(resource->arg_num()); bool modified = !resource->value().IsIdenticalTo(resource->initial_value()); // TensorArray gradients were modified if their values changed or there are // any newly created gradients. @@ -289,8 +286,8 @@ Status BuildComputation( // Request that the value be returned on a specific core. xla::XlaScopedShardingAssignment assign_sharding( - builder, core == -1 ? absl::optional() - : xla::sharding_builder::AssignDevice(core)); + builder, it == arg_shardings.end() ? absl::optional() + : it->second); xla::XlaOp handle; TF_RETURN_IF_ERROR(resource->Pack(&handle, builder)); @@ -742,7 +739,7 @@ Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg, Status XlaCompiler::BuildArguments( const Graph& graph, const std::vector& args, bool use_tuple_arg, xla::XlaBuilder* builder, XlaContext* context, - const std::map& arg_cores, + const std::map& arg_shardings, std::vector* arg_expressions, std::vector* input_to_args, std::vector* input_shapes, bool is_entry_computation) { @@ -833,10 +830,10 @@ Status XlaCompiler::BuildArguments( xla::OpSharding tuple_sharding; tuple_sharding.set_type(xla::OpSharding::TUPLE); for (int64 parameter : *input_to_args) { - auto it = arg_cores.find(parameter); - const int core = it == arg_cores.end() ? 
0 : it->second; + auto it = arg_shardings.find(parameter); *tuple_sharding.add_tuple_shardings() = - xla::sharding_builder::AssignDevice(core); + it == arg_shardings.end() ? xla::sharding_builder::AssignDevice(0) + : it->second; } std::vector is_same_across_replicas; for (int i = 0; i < input_to_args->size(); ++i) { @@ -867,20 +864,18 @@ Status XlaCompiler::BuildArguments( } for (std::vector::size_type i = 0; i < input_to_args->size(); ++i) { - auto it = arg_cores.find(i); - const int core = it == arg_cores.end() ? -1 : it->second; + auto it = arg_shardings.find(i); xla::XlaScopedShardingAssignment assign_sharding( - builder, core == -1 ? absl::optional() - : xla::sharding_builder::AssignDevice(core)); + builder, it == arg_shardings.end() ? absl::optional() + : it->second); arg_handles[i] = xla::GetTupleElement(tuple, i); } } else { for (std::vector::size_type i = 0; i < input_to_args->size(); ++i) { - auto it = arg_cores.find(i); - const int core = it == arg_cores.end() ? -1 : it->second; + auto it = arg_shardings.find(i); xla::XlaScopedShardingAssignment assign_sharding( - builder, core == -1 ? absl::optional() - : xla::sharding_builder::AssignDevice(core)); + builder, it == arg_shardings.end() ? absl::optional() + : it->second); if (is_entry_computation) { // Add an entry to is_same_across_replicas for every leaf buffer. std::vector is_same_across_replicas( @@ -1155,16 +1150,16 @@ Status XlaCompiler::CompileGraph( real_args.push_back(token_arg); } - std::map arg_cores; - std::map retval_cores; - TF_ASSIGN_OR_RETURN(std::tie(arg_cores, retval_cores), - ComputeArgAndRetvalCores(*graph)); + std::map arg_shardings; + std::map retval_shardings; + TF_ASSIGN_OR_RETURN(std::tie(arg_shardings, retval_shardings), + ComputeArgAndRetvalShardings(*graph)); std::vector arg_expressions; TF_RETURN_IF_ERROR(BuildArguments( - *graph, real_args, options.use_tuple_arg, &builder, context, arg_cores, - &arg_expressions, &result->input_mapping, &result->xla_input_shapes, - options.is_entry_computation)); + *graph, real_args, options.use_tuple_arg, &builder, context, + arg_shardings, &arg_expressions, &result->input_mapping, + &result->xla_input_shapes, options.is_entry_computation)); context->set_args(std::move(arg_expressions)); // Propagate any aliases given to us by the user. @@ -1233,7 +1228,7 @@ Status XlaCompiler::CompileGraph( ConvertConstantsToExpressions(&builder, absl::Span(retvals)); } TF_RETURN_IF_ERROR(BuildComputation( - real_args, retvals, arg_cores, retval_cores, context->resources(), + real_args, retvals, arg_shardings, retval_shardings, context->resources(), std::move(token_output), options.is_entry_computation ? options_.shape_representation_fn : ShapeRepresentationFn{}, diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h index 1cc5d8d4728..55220060e93 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.h +++ b/tensorflow/compiler/tf2xla/xla_compiler.h @@ -446,7 +446,7 @@ class XlaCompiler { const std::vector& args, bool use_tuple_arg, xla::XlaBuilder* builder, XlaContext* context, - const std::map& arg_cores, + const std::map& arg_shardings, std::vector* arg_expressions, std::vector* input_to_args, std::vector* input_shapes, From 150a6c06b281246cb5a075a704fceeb257bb63af Mon Sep 17 00:00:00 2001 From: Jian Li Date: Tue, 23 Jul 2019 19:48:41 -0700 Subject: [PATCH 0444/3053] Add a check on the 0th dimension of filter for DepthwiseConv. 
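TFLite's depthwise convolution expects the filter in [1, filter_height, filter_width, output_channels] layout, and the change below rejects a leading dimension other than 1 at Prepare time. A minimal Python sketch of the same validation (a hypothetical helper, not TFLite code, and it also checks the rank for clarity):

def check_depthwise_filter_shape(filter_shape):
  # Filter in DepthwiseConv is expected to be [1, H, W, O].
  if len(filter_shape) != 4 or filter_shape[0] != 1:
    raise ValueError('DepthwiseConv filter must be [1, H, W, O], got %r'
                     % (filter_shape,))

check_depthwise_filter_shape([1, 3, 3, 16])    # accepted
try:
  check_depthwise_filter_shape([16, 3, 3, 1])  # regular-conv layout: rejected
except ValueError as e:
  print(e)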
PiperOrigin-RevId: 259662414 --- tensorflow/lite/kernels/depthwise_conv.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/lite/kernels/depthwise_conv.cc b/tensorflow/lite/kernels/depthwise_conv.cc index bfa3697c0a9..1f50b3741d6 100644 --- a/tensorflow/lite/kernels/depthwise_conv.cc +++ b/tensorflow/lite/kernels/depthwise_conv.cc @@ -113,6 +113,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { data_type == kTfLiteInt8); TF_LITE_ENSURE_EQ(context, output->type, data_type); TF_LITE_ENSURE_EQ(context, filter->type, data_type); + // Filter in DepthwiseConv is expected to be [1, H, W, O]. + TF_LITE_ENSURE_EQ(context, SizeOfDimension(filter, 0), 1); if (hasBias) { bias = GetInput(context, node, kBiasTensor); From 805b28132ee79c2db8023d25774c56cd399f5b88 Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Tue, 23 Jul 2019 20:56:05 -0700 Subject: [PATCH 0445/3053] Prevent test failures by manually triggering Python garbage collector before resetting the server def. Due to current implementation of set_server_def, resources might be leaked and destroyed after the device manager (and devices) are released. When there are multiple set_server_def calls, this leads to non-deterministic segfaults when the Python GC starts to clean up hanging resources. PiperOrigin-RevId: 259668467 --- tensorflow/python/eager/benchmarks_test.py | 29 ++++++++++++++++------ 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py index 7113144d237..615e8a81136 100644 --- a/tensorflow/python/eager/benchmarks_test.py +++ b/tensorflow/python/eager/benchmarks_test.py @@ -25,6 +25,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import gc import os import time @@ -1114,8 +1115,7 @@ class RemoteWorkerMicroBenchmarks(test.Benchmark): wall_time=mean_us, extras={"examples_per_sec": num_iters / total_time}) - # TODO(b/136184459): Re-enabled once crash is fixed - def _DISABLED_benchmark_send_mirroring_off(self): + def benchmark_send_mirroring_off(self): remote.connect_to_remote_host(self._cached_server_target1) x = random_ops.random_uniform((2, 2)).cpu() @@ -1130,9 +1130,12 @@ class RemoteWorkerMicroBenchmarks(test.Benchmark): context.context().mirroring_policy = context.MIRRORING_NONE self._run(lambda: func(x)) + # NOTE(b/136184459): Force garbage collecting hanging resources before + # subsequent calls to set_server_def, to ensure the destroy resource ops are + # executed when their corresponding device and manager are still available. + gc.collect() - # TODO(b/136184459): Re-enabled once crash is fixed - def _DISABLED_benchmark_send_mirroring_on(self): + def benchmark_send_mirroring_on(self): remote.connect_to_remote_host(self._cached_server_target1) x = random_ops.random_uniform((2, 2)).cpu() @@ -1147,9 +1150,12 @@ class RemoteWorkerMicroBenchmarks(test.Benchmark): context.context().mirroring_policy = context.MIRRORING_ALL self._run(lambda: func(x)) + # NOTE(b/136184459): Force garbage collecting hanging resources before + # subsequent calls to set_server_def, to ensure the destroy resource ops are + # executed when their corresponding device and manager are still available. 
+ gc.collect() - # TODO(b/136184459): Re-enabled once crash is fixed - def _DISABLED_benchmark_worker_mirroring_off(self): + def benchmark_worker_mirroring_off(self): remote.connect_to_remote_host( [self._cached_server_target1, self._cached_server_target2]) @@ -1166,9 +1172,12 @@ class RemoteWorkerMicroBenchmarks(test.Benchmark): context.context().mirroring_policy = context.MIRRORING_NONE self._run(func) + # NOTE(b/136184459): Force garbage collecting hanging resources before + # subsequent calls to set_server_def, to ensure the destroy resource ops are + # executed when their corresponding device and manager are still available. + gc.collect() - # TODO(b/136184459): Re-enabled once crash is fixed - def _DISABLED_benchmark_worker_mirroring_on(self): + def benchmark_worker_mirroring_on(self): remote.connect_to_remote_host( [self._cached_server_target1, self._cached_server_target2]) @@ -1185,6 +1194,10 @@ class RemoteWorkerMicroBenchmarks(test.Benchmark): context.context().mirroring_policy = context.MIRRORING_ALL self._run(func) + # NOTE(b/136184459): Force garbage collecting hanging resources before + # subsequent calls to set_server_def, to ensure the destroy resource ops are + # executed when their corresponding device and manager are still available. + gc.collect() if __name__ == "__main__": From 2a4b5a3f239b667e2720e73b3048c9896659b0bb Mon Sep 17 00:00:00 2001 From: Ayush Dubey Date: Tue, 23 Jul 2019 20:58:58 -0700 Subject: [PATCH 0446/3053] Add an unbounded work queue based on the existing `UnboundedThreadPool` implementation. This change adds `UnboundedWorkQueue` to tensorflow/core/platform for general use in TensorFlow runtime. The implementation is basically the same as the existing tf.data unbounded thread pool. After this change, `UnboundedThreadPool` is a thin wrapper around `UnboundedWorkQueue`. 
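A minimal usage sketch of the new work queue, assuming only the declarations this patch adds in tensorflow/core/platform/unbounded_work_queue.h (the BlockingCounter-based wait below is illustrative, not part of the change):

#include "tensorflow/core/lib/core/blocking_counter.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/unbounded_work_queue.h"

namespace tensorflow {

void ScheduleExampleWork() {
  // One queue multiplexes many short-lived "logical" threads onto a pool of
  // physical threads that grows on demand.
  UnboundedWorkQueue work_queue(Env::Default(), "example_work_queue");

  constexpr int kNumClosures = 8;
  BlockingCounter done(kNumClosures);
  for (int i = 0; i < kNumClosures; ++i) {
    // Schedule() takes a std::function<void()>; the closure may block without
    // preventing other scheduled work from making progress.
    work_queue.Schedule([&done]() { done.DecrementCount(); });
  }
  done.Wait();  // Returns once every scheduled closure has run.
}

}  // namespace tensorflow

This mirrors the pattern exercised by the unbounded_work_queue_test.cc added below.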
PiperOrigin-RevId: 259668662 --- tensorflow/core/BUILD | 32 ++++++ tensorflow/core/kernels/data/BUILD | 1 + .../kernels/data/unbounded_thread_pool.cc | 97 +++------------- .../core/kernels/data/unbounded_thread_pool.h | 36 ++---- .../data/unbounded_thread_pool_test.cc | 62 +---------- .../platform/default/unbounded_work_queue.cc | 101 +++++++++++++++++ .../platform/default/unbounded_work_queue.h | 65 +++++++++++ .../core/platform/unbounded_work_queue.h | 33 ++++++ .../platform/unbounded_work_queue_test.cc | 104 ++++++++++++++++++ 9 files changed, 357 insertions(+), 174 deletions(-) create mode 100644 tensorflow/core/platform/default/unbounded_work_queue.cc create mode 100644 tensorflow/core/platform/default/unbounded_work_queue.h create mode 100644 tensorflow/core/platform/unbounded_work_queue.h create mode 100644 tensorflow/core/platform/unbounded_work_queue_test.cc diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 89b9e2fb73f..edd9e05b1af 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -626,6 +626,38 @@ filegroup( visibility = ["//visibility:private"], ) +cc_library( + name = "platform_unbounded_work_queue", + srcs = tf_platform_srcs([ + "unbounded_work_queue.cc", + ]) + tf_platform_hdrs([ + "unbounded_work_queue.h", + ]), + hdrs = ["platform/unbounded_work_queue.h"], + deps = [ + ":core_cpu_internal", + ":framework", + ":lib", + "@com_google_absl//absl/memory", + ], +) + +tf_cc_test( + name = "platform_unbounded_work_queue_test", + srcs = ["platform/unbounded_work_queue_test.cc"], + deps = [ + ":framework", + ":lib", + ":lib_internal", + ":lib_test_internal", + ":platform_unbounded_work_queue", + ":protos_all_cc", + ":test", + ":test_main", + "@com_google_absl//absl/memory", + ], +) + # Headers that are not exported as part of ":lib". filegroup( name = "platform_other_internal_hdrs", diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD index a5f41b6dcae..8905641536e 100644 --- a/tensorflow/core/kernels/data/BUILD +++ b/tensorflow/core/kernels/data/BUILD @@ -180,6 +180,7 @@ cc_library( "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", "//tensorflow/core:lib", + "//tensorflow/core:platform_unbounded_work_queue", "@com_google_absl//absl/memory", ], ) diff --git a/tensorflow/core/kernels/data/unbounded_thread_pool.cc b/tensorflow/core/kernels/data/unbounded_thread_pool.cc index ac12197f1b8..9bb8f4e92e6 100644 --- a/tensorflow/core/kernels/data/unbounded_thread_pool.cc +++ b/tensorflow/core/kernels/data/unbounded_thread_pool.cc @@ -16,8 +16,9 @@ limitations under the License. #include "tensorflow/core/kernels/data/unbounded_thread_pool.h" #include "absl/memory/memory.h" +#include "tensorflow/core/lib/core/notification.h" #include "tensorflow/core/platform/env.h" -#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/unbounded_work_queue.h" namespace tensorflow { namespace data { @@ -30,7 +31,7 @@ class UnboundedThreadPool::LogicalThreadFactory : public ThreadFactory { std::unique_ptr StartThread(const string& name, std::function fn) override { - return pool_->RunOnPooledThread(std::move(fn)); + return pool_->ScheduleOnWorkQueue(std::move(fn)); } private: @@ -52,8 +53,7 @@ class UnboundedThreadPool::LogicalThreadWrapper : public Thread { // NOTE: The `Thread` destructor is expected to "join" the created thread, // but the physical thread may continue to execute after the work for this // thread is complete. 
We simulate this by waiting on a notification that - // the `CachedThreadFunc` will notify when the thread's work function is - // complete. + // the thread's work function will notify when it is complete. join_notification_->WaitForNotification(); } @@ -61,96 +61,25 @@ class UnboundedThreadPool::LogicalThreadWrapper : public Thread { std::shared_ptr join_notification_; }; -UnboundedThreadPool::~UnboundedThreadPool() { - { - mutex_lock l(work_queue_mu_); - // Wake up all `CachedThreadFunc` threads and cause them to terminate before - // joining them when `threads_` is cleared. - cancelled_ = true; - work_queue_cv_.notify_all(); - if (!work_queue_.empty()) { - LOG(ERROR) << "UnboundedThreadPool named \"" << thread_name_ << "\" was " - << "deleted with pending work in its queue. This may indicate " - << "a potential use-after-free bug."; - } - } - - { - mutex_lock l(thread_pool_mu_); - // Clear the list of pooled threads, which will eventually terminate due to - // the previous notification. - // - // NOTE: It is safe to do this while holding `pooled_threads_mu_`, because - // no subsequent calls to `this->StartThread()` should be issued after the - // destructor starts. - thread_pool_.clear(); - } -} - std::shared_ptr UnboundedThreadPool::get_thread_factory() { return std::make_shared(this); } -size_t UnboundedThreadPool::size() { - tf_shared_lock l(thread_pool_mu_); - return thread_pool_.size(); +namespace { +void WorkQueueFunc(const std::function& fn, + std::shared_ptr notification) { + fn(); + notification->Notify(); } +} // namespace -std::unique_ptr UnboundedThreadPool::RunOnPooledThread( +std::unique_ptr UnboundedThreadPool::ScheduleOnWorkQueue( std::function fn) { auto join_notification = std::make_shared(); - bool all_threads_busy; - { - // Enqueue a work item for the new thread's function, and wake up a - // cached thread to process it. - mutex_lock l(work_queue_mu_); - work_queue_.push_back({std::move(fn), join_notification}); - work_queue_cv_.notify_one(); - // NOTE: The queue may be non-empty, so we must account for queued work when - // considering how many threads are free. - all_threads_busy = work_queue_.size() > num_idle_threads_; - } - - if (all_threads_busy) { - // Spawn a new physical thread to process the given function. - // NOTE: `PooledThreadFunc` will eventually increment `num_idle_threads_` - // at the beginning of its work loop. - Thread* new_thread = env_->StartThread( - {}, thread_name_, - std::bind(&UnboundedThreadPool::PooledThreadFunc, this)); - - mutex_lock l(thread_pool_mu_); - thread_pool_.emplace_back(new_thread); - } - + unbounded_work_queue_.Schedule( + std::bind(&WorkQueueFunc, std::move(fn), join_notification)); return absl::make_unique(std::move(join_notification)); } -void UnboundedThreadPool::PooledThreadFunc() { - while (true) { - WorkItem work_item; - { - mutex_lock l(work_queue_mu_); - ++num_idle_threads_; - while (!cancelled_ && work_queue_.empty()) { - // Wait for a new work function to be submitted, or the cache to be - // destroyed. - work_queue_cv_.wait(l); - } - if (cancelled_) { - return; - } - work_item = std::move(work_queue_.front()); - work_queue_.pop_front(); - --num_idle_threads_; - } - - work_item.work_function(); - - // Notify any thread that has "joined" the cached thread for this work item. 
- work_item.done_notification->Notify(); - } -} - } // namespace data } // namespace tensorflow diff --git a/tensorflow/core/kernels/data/unbounded_thread_pool.h b/tensorflow/core/kernels/data/unbounded_thread_pool.h index c84d495b296..90a54b9b19f 100644 --- a/tensorflow/core/kernels/data/unbounded_thread_pool.h +++ b/tensorflow/core/kernels/data/unbounded_thread_pool.h @@ -20,55 +20,33 @@ limitations under the License. #include #include "tensorflow/core/framework/thread_factory.h" -#include "tensorflow/core/lib/core/notification.h" #include "tensorflow/core/platform/env.h" -#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/unbounded_work_queue.h" namespace tensorflow { namespace data { // An `UnboundedThreadPool` provides a mechanism for temporally multiplexing a // potentially large number of "logical" threads onto a smaller number of -// "physical" threads. The multiplexing is achieved by maintaining an internal -// pool of long-running "physical" threads that are used to execute the -// "logical" threads. Like a regular thread, a "logical" thread may block on -// other threads, and the size of the pool will increase to ensure that progress -// is made. This mechanism is recommended in situations where short-lived -// threads are created repeatedly, to avoid the overhead and memory -// fragmentation that can result from excessive thread creation. +// "physical" threads. The multiplexing is achieved by using an +// `UnboundedWorkQueue`. class UnboundedThreadPool { public: UnboundedThreadPool(Env* env, const string& thread_name) - : env_(env), thread_name_(thread_name) {} - ~UnboundedThreadPool(); + : unbounded_work_queue_(env, thread_name) {} + ~UnboundedThreadPool() = default; // Returns an implementation of `ThreadFactory` that can be used to create // logical threads in this pool. std::shared_ptr get_thread_factory(); - // Returns the current number of threads in this pool. - size_t size(); - private: class LogicalThreadFactory; class LogicalThreadWrapper; - struct WorkItem { - std::function work_function; - std::shared_ptr done_notification; - }; - std::unique_ptr RunOnPooledThread(std::function fn); - void PooledThreadFunc(); + std::unique_ptr ScheduleOnWorkQueue(std::function fn); - Env* const env_; // Not owned. - const string thread_name_; - mutex work_queue_mu_; - condition_variable work_queue_cv_ GUARDED_BY(work_queue_mu_); - size_t num_idle_threads_ GUARDED_BY(work_queue_mu_) = 0; - bool cancelled_ GUARDED_BY(work_queue_mu_) = false; - std::deque work_queue_ GUARDED_BY(work_queue_mu_); - mutex thread_pool_mu_; - std::vector> thread_pool_ GUARDED_BY(thread_pool_mu_); + UnboundedWorkQueue unbounded_work_queue_; }; } // namespace data diff --git a/tensorflow/core/kernels/data/unbounded_thread_pool_test.cc b/tensorflow/core/kernels/data/unbounded_thread_pool_test.cc index f996b4f931b..3604be86473 100644 --- a/tensorflow/core/kernels/data/unbounded_thread_pool_test.cc +++ b/tensorflow/core/kernels/data/unbounded_thread_pool_test.cc @@ -23,59 +23,6 @@ namespace tensorflow { namespace data { namespace { -TEST(UnboundedThreadPool, SingleThread) { - UnboundedThreadPool pool(Env::Default(), "test"); - auto thread_factory = pool.get_thread_factory(); - - // Create a thread that updates a variable, and ensure that it runs to - // completion. 
- std::atomic i(0); - auto thread = thread_factory->StartThread("", [&i]() { ++i; }); - thread.reset(); - - EXPECT_GE(pool.size(), 1); - EXPECT_EQ(1, i); -} - -TEST(UnboundedThreadPool, MultipleThreads) { - UnboundedThreadPool pool(Env::Default(), "test"); - auto thread_factory = pool.get_thread_factory(); - - // Create ten threads that update a variable, and ensure that they all run - // to completion. - std::vector> threads; - const int kNumThreadsToCreate = 10; - std::atomic i(0); - for (int j = 0; j < kNumThreadsToCreate; ++j) { - threads.push_back(thread_factory->StartThread("", [&i]() { ++i; })); - } - threads.clear(); - - EXPECT_GE(pool.size(), 1); - EXPECT_EQ(i, kNumThreadsToCreate); -} - -TEST(UnboundedThreadPool, MultipleThreadsSleepingRandomly) { - UnboundedThreadPool pool(Env::Default(), "test"); - auto thread_factory = pool.get_thread_factory(); - - // Create 1000 threads that sleep for a random period of time then update a - // variable, and ensure that they all run to completion. - std::vector> threads; - const int kNumThreadsToCreate = 1000; - std::atomic i(0); - for (int j = 0; j < kNumThreadsToCreate; ++j) { - threads.push_back(thread_factory->StartThread("", [&i]() { - Env::Default()->SleepForMicroseconds(random::New64() % 10); - ++i; - })); - } - threads.clear(); - - EXPECT_GE(pool.size(), 1); - EXPECT_EQ(i, kNumThreadsToCreate); -} - TEST(UnboundedThreadPool, ConcurrentThreadCreation) { UnboundedThreadPool pool(Env::Default(), "test"); auto thread_factory = pool.get_thread_factory(); @@ -97,7 +44,6 @@ TEST(UnboundedThreadPool, ConcurrentThreadCreation) { } threads.clear(); - EXPECT_GE(pool.size(), 1); EXPECT_EQ(i, kNumThreadsToCreate * kNumThreadsToCreate); } @@ -108,9 +54,7 @@ TEST(UnboundedThreadPool, MultipleBlockingThreads) { std::vector> threads; // Create multiple waves (with increasing sizes) of threads that all block - // before returning, and - // ensure that we create the appropriate number of threads and terminate - // correctly. + // before returning, and ensure that we terminate correctly. std::vector round_sizes = {5, 10, 15, 20}; for (const int round_size : round_sizes) { @@ -129,10 +73,6 @@ TEST(UnboundedThreadPool, MultipleBlockingThreads) { // wave is increasing, we should have at least that number of threads in the // pool. bc.Wait(); - // NOTE: There is a benign race between a new round starting and the - // physical threads from the previous round returning to the pool, so we may - // create more threads than the round_size. - EXPECT_GE(pool.size(), round_size); n.Notify(); threads.clear(); } diff --git a/tensorflow/core/platform/default/unbounded_work_queue.cc b/tensorflow/core/platform/default/unbounded_work_queue.cc new file mode 100644 index 00000000000..249d6358643 --- /dev/null +++ b/tensorflow/core/platform/default/unbounded_work_queue.cc @@ -0,0 +1,101 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/platform/unbounded_work_queue.h" + +#include "absl/memory/memory.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mutex.h" + +namespace tensorflow { + +UnboundedWorkQueue::UnboundedWorkQueue(Env* env, const string& thread_name) + : env_(env), thread_name_(thread_name) {} + +UnboundedWorkQueue::~UnboundedWorkQueue() { + { + mutex_lock l(work_queue_mu_); + // Wake up all `PooledThreadFunc` threads and cause them to terminate before + // joining them when `threads_` is cleared. + cancelled_ = true; + work_queue_cv_.notify_all(); + if (!work_queue_.empty()) { + LOG(ERROR) << "UnboundedWorkQueue named \"" << thread_name_ << "\" was " + << "deleted with pending work in its queue. This may indicate " + << "a potential use-after-free bug."; + } + } + + { + mutex_lock l(thread_pool_mu_); + // Clear the list of pooled threads, which will eventually terminate due to + // the previous notification. + // + // NOTE: It is safe to do this while holding `pooled_threads_mu_`, because + // no subsequent calls to `this->StartThread()` should be issued after the + // destructor starts. + thread_pool_.clear(); + } +} + +void UnboundedWorkQueue::Schedule(WorkFunction fn) { + bool all_threads_busy; + { + // Enqueue a work item for the new thread's function, and wake up a + // cached thread to process it. + mutex_lock l(work_queue_mu_); + work_queue_.push_back(std::move(fn)); + work_queue_cv_.notify_one(); + // NOTE: The queue may be non-empty, so we must account for queued work when + // considering how many threads are free. + all_threads_busy = work_queue_.size() > num_idle_threads_; + } + + if (all_threads_busy) { + // Spawn a new physical thread to process the given function. + // NOTE: `PooledThreadFunc` will eventually increment `num_idle_threads_` + // at the beginning of its work loop. + Thread* new_thread = + env_->StartThread({}, thread_name_, [this]() { PooledThreadFunc(); }); + + mutex_lock l(thread_pool_mu_); + thread_pool_.emplace_back(new_thread); + } +} + +void UnboundedWorkQueue::PooledThreadFunc() { + while (true) { + WorkFunction fn; + { + mutex_lock l(work_queue_mu_); + ++num_idle_threads_; + while (!cancelled_ && work_queue_.empty()) { + // Wait for a new work function to be submitted, or the cache to be + // destroyed. + work_queue_cv_.wait(l); + } + if (cancelled_) { + return; + } + fn = std::move(work_queue_.front()); + work_queue_.pop_front(); + --num_idle_threads_; + } + + fn(); + } +} + +} // namespace tensorflow diff --git a/tensorflow/core/platform/default/unbounded_work_queue.h b/tensorflow/core/platform/default/unbounded_work_queue.h new file mode 100644 index 00000000000..cba83622a3a --- /dev/null +++ b/tensorflow/core/platform/default/unbounded_work_queue.h @@ -0,0 +1,65 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_PLATFORM_DEFAULT_UNBOUNDED_WORK_QUEUE_H_ +#define TENSORFLOW_CORE_PLATFORM_DEFAULT_UNBOUNDED_WORK_QUEUE_H_ + +#include +#include +#include + +#include "tensorflow/core/lib/core/notification.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mutex.h" + +namespace tensorflow { + +// An `UnboundedWorkQueue` provides a mechanism for temporally multiplexing a +// potentially large number of "logical" threads onto a smaller number of +// "physical" threads. The multiplexing is achieved by maintaining an internal +// pool of long-running "physical" threads that are used to execute the +// "logical" threads. Like a regular thread, a "logical" thread may block on +// other threads, and the size of the pool will increase to ensure that progress +// is made. This mechanism is recommended in situations where short-lived +// threads are created repeatedly, to avoid the overhead and memory +// fragmentation that can result from excessive thread creation. +class UnboundedWorkQueue { + public: + UnboundedWorkQueue(Env* env, const string& thread_name); + ~UnboundedWorkQueue(); + + using WorkFunction = std::function; + + // Schedule `fn` on a thread. `fn` may perform blocking work, so if all the + // existing threads are blocked or busy, this may spawn a new thread which + // will be added to the thread pool managed by this work queue. + void Schedule(WorkFunction fn); + + private: + void PooledThreadFunc(); + + Env* const env_; // Not owned. + const string thread_name_; + mutex work_queue_mu_; + condition_variable work_queue_cv_ GUARDED_BY(work_queue_mu_); + size_t num_idle_threads_ GUARDED_BY(work_queue_mu_) = 0; + bool cancelled_ GUARDED_BY(work_queue_mu_) = false; + std::deque work_queue_ GUARDED_BY(work_queue_mu_); + mutex thread_pool_mu_; + std::vector> thread_pool_ GUARDED_BY(thread_pool_mu_); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_DEFAULT_UNBOUNDED_WORK_QUEUE_H_ diff --git a/tensorflow/core/platform/unbounded_work_queue.h b/tensorflow/core/platform/unbounded_work_queue.h new file mode 100644 index 00000000000..242980dafa9 --- /dev/null +++ b/tensorflow/core/platform/unbounded_work_queue.h @@ -0,0 +1,33 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_UNBOUNDED_WORK_QUEUE_H_ +#define TENSORFLOW_CORE_PLATFORM_UNBOUNDED_WORK_QUEUE_H_ + +#include "tensorflow/core/platform/platform.h" + +// An `UnboundedWorkQueue` feeds potentially-blocking work into a thread-pool +// whose size automatically increases with demand. 
+ +#if defined(PLATFORM_GOOGLE) +#include "tensorflow/core/platform/google/unbounded_work_queue.h" +#elif defined(PLATFORM_POSIX) || defined(PLATFORM_POSIX_ANDROID) || \ + defined(PLATFORM_GOOGLE_ANDROID) || defined(PLATFORM_WINDOWS) +#include "tensorflow/core/platform/default/unbounded_work_queue.h" +#else +#error Define the appropriate PLATFORM_ macro for this platform +#endif + +#endif // TENSORFLOW_CORE_PLATFORM_UNBOUNDED_WORK_QUEUE_H_ diff --git a/tensorflow/core/platform/unbounded_work_queue_test.cc b/tensorflow/core/platform/unbounded_work_queue_test.cc new file mode 100644 index 00000000000..03d91cd4893 --- /dev/null +++ b/tensorflow/core/platform/unbounded_work_queue_test.cc @@ -0,0 +1,104 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/platform/unbounded_work_queue.h" + +#include "absl/memory/memory.h" +#include "tensorflow/core/lib/core/blocking_counter.h" +#include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { + +class UnboundedWorkQueueTest : public ::testing::Test { + protected: + UnboundedWorkQueueTest() + : work_queue_( + absl::make_unique(Env::Default(), "test")) {} + ~UnboundedWorkQueueTest() override = default; + + void RunMultipleCopiesOfClosure(const int num_closures, + std::function fn) { + for (int i = 0; i < num_closures; ++i) { + work_queue_->Schedule([this, fn]() { + fn(); + mutex_lock l(mu_); + ++closure_count_; + cond_var_.notify_all(); + }); + } + } + + void BlockUntilClosuresDone(const int num_closures) { + mutex_lock l(mu_); + while (closure_count_ < num_closures) { + cond_var_.wait(l); + } + } + + void ResetQueue() { work_queue_.reset(); } + + int NumClosuresExecuted() { + mutex_lock l(mu_); + return closure_count_; + } + + private: + mutex mu_; + int closure_count_ GUARDED_BY(mu_) = 0; + condition_variable cond_var_; + std::unique_ptr work_queue_; +}; + +TEST_F(UnboundedWorkQueueTest, SingleClosure) { + constexpr int num_closures = 1; + RunMultipleCopiesOfClosure(num_closures, []() {}); + BlockUntilClosuresDone(num_closures); +} + +TEST_F(UnboundedWorkQueueTest, MultipleClosures) { + constexpr int num_closures = 10; + RunMultipleCopiesOfClosure(num_closures, []() {}); + BlockUntilClosuresDone(num_closures); +} + +TEST_F(UnboundedWorkQueueTest, MultipleClosuresSleepingRandomly) { + constexpr int num_closures = 1000; + RunMultipleCopiesOfClosure(num_closures, []() { + Env::Default()->SleepForMicroseconds(random::New64() % 10); + }); + BlockUntilClosuresDone(num_closures); +} + +TEST_F(UnboundedWorkQueueTest, NestedClosures) { + constexpr int num_closures = 10; + // Run `num_closures` closures, each of which runs `num_closures` closures. 
+ RunMultipleCopiesOfClosure(num_closures, [this]() { + RunMultipleCopiesOfClosure(num_closures, []() {}); + }); + BlockUntilClosuresDone(num_closures * num_closures + num_closures); +} + +TEST_F(UnboundedWorkQueueTest, RacyDestructor) { + constexpr int num_closures = 100; + // Run `num_closures` closures, then delete `work_queue_`. + RunMultipleCopiesOfClosure(num_closures, []() {}); + ResetQueue(); + EXPECT_LE(NumClosuresExecuted(), num_closures); +} + +} // namespace +} // namespace tensorflow From 2b5ece29d3f22b42645d5ca6ba0b2a8c575c4303 Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Tue, 23 Jul 2019 22:21:29 -0700 Subject: [PATCH 0447/3053] Mechanical replacement of mirror.tensorflow.org with https equivalent. PiperOrigin-RevId: 259676414 --- WORKSPACE | 2 +- .../contrib/makefile/download_dependencies.sh | 10 +- .../tools/make/third_party_downloads.inc | 2 +- .../lite/tools/make/download_dependencies.sh | 8 +- tensorflow/workspace.bzl | 136 +++++++++--------- third_party/aws/workspace.bzl | 2 +- third_party/flatbuffers/workspace.bzl | 2 +- third_party/highwayhash/workspace.bzl | 2 +- third_party/hwloc/workspace.bzl | 2 +- third_party/icu/workspace.bzl | 2 +- third_party/jpeg/workspace.bzl | 2 +- .../keras_applications_archive/workspace.bzl | 2 +- third_party/kissfft/workspace.bzl | 2 +- third_party/mlir/mlir_configure.bzl | 2 +- third_party/nasm/workspace.bzl | 2 +- third_party/ortools/workspace.bzl | 2 +- third_party/pasta/workspace.bzl | 2 +- .../preconfig/generate/archives.bzl | 2 +- 18 files changed, 92 insertions(+), 92 deletions(-) diff --git a/WORKSPACE b/WORKSPACE index 43312f350d6..d5bd495ec4d 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -7,7 +7,7 @@ http_archive( sha256 = "5b00383d08dd71f28503736db0500b6fb4dda47489ff5fc6bed42557c07c6ba9", strip_prefix = "rules_closure-308b05b2419edb5c8ee0471b67a40403df940149", urls = [ - "http://mirror.tensorflow.org/github.com/bazelbuild/rules_closure/archive/308b05b2419edb5c8ee0471b67a40403df940149.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/bazelbuild/rules_closure/archive/308b05b2419edb5c8ee0471b67a40403df940149.tar.gz", "https://github.com/bazelbuild/rules_closure/archive/308b05b2419edb5c8ee0471b67a40403df940149.tar.gz", # 2019-06-13 ], ) diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh index 1feca44f6e5..efa122b34d8 100755 --- a/tensorflow/contrib/makefile/download_dependencies.sh +++ b/tensorflow/contrib/makefile/download_dependencies.sh @@ -27,9 +27,9 @@ if [ ! -f $BZL_FILE_PATH ]; then fi EIGEN_URL="$(grep -o 'https://bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)" -GEMMLOWP_URL="$(grep -o 'http://mirror.tensorflow.org/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" +GEMMLOWP_URL="$(grep -o 'https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz" -NSYNC_URL="$(grep -o 'http://mirror.tensorflow.org/github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" +NSYNC_URL="$(grep -o 'https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" # Note: The protobuf repo needs to be cloned due to its submodules. 
# These variables contain the GitHub repo and the sha, from `tensorflow/workspace.bzl`, @@ -37,7 +37,7 @@ NSYNC_URL="$(grep -o 'http://mirror.tensorflow.org/github.com/google/nsync/.*tar readonly PROTOBUF_REPO="https://github.com/protocolbuffers/protobuf.git" readonly PROTOBUF_TAG="$(grep -o 'https://github.com/protocolbuffers/protobuf/archive/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1 | awk '{print substr($0, index($0, "archive") + 8, index($0, "tar") - index($0, "archive") - 9) }')" -# TODO (yongtang): Replace the following with 'http://mirror.tensorflow.org/github.com/google/re2/.*tar\.gz' once +# TODO (yongtang): Replace the following with 'https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/re2/.*tar\.gz' once # the archive has been propagated in mirror.tensorflow.org. RE2_URL="$(grep -o 'https://github.com/google/re2/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)" FFT2D_URL="$(grep -o 'http.*fft2d\.tgz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)" @@ -46,8 +46,8 @@ ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_ CUB_URL="$(grep -o 'https.*cub/archive.*zip' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)" # Required for TensorFlow Lite Flex runtime. -FARMHASH_URL="http://mirror.tensorflow.org/github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz" -FLATBUFFERS_URL="http://mirror.tensorflow.org/github.com/google/flatbuffers/archive/v1.11.0.tar.gz" +FARMHASH_URL="https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz" +FLATBUFFERS_URL="https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/flatbuffers/archive/v1.11.0.tar.gz" # TODO(petewarden): Some new code in Eigen triggers a clang bug with iOS arm64, # so work around it by patching the source. diff --git a/tensorflow/lite/experimental/micro/tools/make/third_party_downloads.inc b/tensorflow/lite/experimental/micro/tools/make/third_party_downloads.inc index 1d0164b718c..42ecf3f965d 100644 --- a/tensorflow/lite/experimental/micro/tools/make/third_party_downloads.inc +++ b/tensorflow/lite/experimental/micro/tools/make/third_party_downloads.inc @@ -3,7 +3,7 @@ GEMMLOWP_URL := "https://github.com/google/gemmlowp/archive/719139ce755a0f31cbf1c37f7f98adcc7fc9f425.zip" GEMMLOWP_MD5 := "7e8191b24853d75de2af87622ad293ba" -FLATBUFFERS_URL := "http://mirror.tensorflow.org/github.com/google/flatbuffers/archive/v1.11.0.tar.gz" +FLATBUFFERS_URL := "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/flatbuffers/archive/v1.11.0.tar.gz" FLATBUFFERS_MD5 := "02c64880acb89dbd57eebacfd67200d8" ifeq ($(HOST_OS),osx) diff --git a/tensorflow/lite/tools/make/download_dependencies.sh b/tensorflow/lite/tools/make/download_dependencies.sh index 1b0df57624f..4b4df1e9f9d 100755 --- a/tensorflow/lite/tools/make/download_dependencies.sh +++ b/tensorflow/lite/tools/make/download_dependencies.sh @@ -30,13 +30,13 @@ if [ ! 
-f $BZL_FILE_PATH ]; then fi EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.tensorflow | head -n1)" -GEMMLOWP_URL="$(grep -o 'http://mirror.tensorflow.org/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" +GEMMLOWP_URL="$(grep -o 'https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz" ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)" NEON_2_SSE_URL="https://github.com/intel/ARM_NEON_2_x86_SSE/archive/master.zip" -FARMHASH_URL="http://mirror.tensorflow.org/github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz" -FLATBUFFERS_URL="http://mirror.tensorflow.org/github.com/google/flatbuffers/archive/v1.11.0.tar.gz" -FFT2D_URL="http://mirror.tensorflow.org/www.kurims.kyoto-u.ac.jp/~ooura/fft2d.tgz" +FARMHASH_URL="https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz" +FLATBUFFERS_URL="https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/flatbuffers/archive/v1.11.0.tar.gz" +FFT2D_URL="https://storage.googleapis.com/mirror.tensorflow.org/www.kurims.kyoto-u.ac.jp/~ooura/fft2d.tgz" # TODO(petewarden): Some new code in Eigen triggers a clang bug with iOS arm64, # so work around it by patching the source. diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 8b7c32844b3..f888e2d8b83 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -96,7 +96,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): sha256 = "a936d6b277a33d2a027a024ea8e65df62bd2e162c7ca52c48486ed9d5dc27160", strip_prefix = "mklml_lnx_2019.0.5.20190502", urls = [ - "http://mirror.tensorflow.org/github.com/intel/mkl-dnn/releases/download/v0.20-rc/mklml_lnx_2019.0.5.20190502.tgz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/intel/mkl-dnn/releases/download/v0.20-rc/mklml_lnx_2019.0.5.20190502.tgz", "https://github.com/intel/mkl-dnn/releases/download/v0.20-rc/mklml_lnx_2019.0.5.20190502.tgz", ], ) @@ -106,7 +106,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): sha256 = "535857b17643d7f7546b58fc621244e7cfcc4fff2aa2ebd3fc5b4e126bfc36cf", strip_prefix = "mklml_win_2019.0.5.20190502", urls = [ - "http://mirror.tensorflow.org/github.com/intel/mkl-dnn/releases/download/v0.20-rc/mklml_win_2019.0.5.20190502.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/intel/mkl-dnn/releases/download/v0.20-rc/mklml_win_2019.0.5.20190502.zip", "https://github.com/intel/mkl-dnn/releases/download/v0.20-rc/mklml_win_2019.0.5.20190502.zip", ], ) @@ -116,7 +116,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): sha256 = "2fbb71a0365d42a39ea7906568d69b1db3bfc9914fee75eedb06c5f32bf5fa68", strip_prefix = "mklml_mac_2019.0.5.20190502", urls = [ - "http://mirror.tensorflow.org/github.com/intel/mkl-dnn/releases/download/v0.20-rc/mklml_mac_2019.0.5.20190502.tgz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/intel/mkl-dnn/releases/download/v0.20-rc/mklml_mac_2019.0.5.20190502.tgz", "https://github.com/intel/mkl-dnn/releases/download/v0.20-rc/mklml_mac_2019.0.5.20190502.tgz", ], ) @@ -136,7 +136,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): sha256 = "26f720ed912843ba293e8a1e0822fe5318e93c529d80c87af1cf555d68e642d0", strip_prefix = 
"mkl-dnn-0.20.1", urls = [ - "http://mirror.tensorflow.org/github.com/intel/mkl-dnn/archive/v0.20.1.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/intel/mkl-dnn/archive/v0.20.1.tar.gz", "https://github.com/intel/mkl-dnn/archive/v0.20.1.tar.gz", ], ) @@ -147,7 +147,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): sha256 = "fcc2d951f7170eade0cfdd0d8d1d58e3e7785bd326bca6555f3722f8cba71811", strip_prefix = "mkl-dnn-1.0-pc2", urls = [ - "http://mirror.tensorflow.org/github.com/intel/mkl-dnn/archive/v1.0-pc2.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/intel/mkl-dnn/archive/v1.0-pc2.tar.gz", "https://github.com/intel/mkl-dnn/archive/v1.0-pc2.tar.gz", ], ) @@ -158,7 +158,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): sha256 = "eee7452846aae8040037234accf9a1cfbeca1d93bb4238b70f0d43d26645a602", strip_prefix = "abseil-cpp-f3840bc5e33ce4932e35986cf3718450c6f02af2", urls = [ - "http://mirror.tensorflow.org/github.com/abseil/abseil-cpp/archive/f3840bc5e33ce4932e35986cf3718450c6f02af2.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/abseil/abseil-cpp/archive/f3840bc5e33ce4932e35986cf3718450c6f02af2.tar.gz", "https://github.com/abseil/abseil-cpp/archive/f3840bc5e33ce4932e35986cf3718450c6f02af2.tar.gz", ], ) @@ -170,7 +170,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): sha256 = "f3d69ac773ecaf3602cb940040390d4e71a501bb145ca9e01ce5464cf6d4eb68", strip_prefix = "eigen-eigen-049af2f56331", urls = [ - "http://mirror.tensorflow.org/bitbucket.org/eigen/eigen/get/049af2f56331.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/bitbucket.org/eigen/eigen/get/049af2f56331.tar.gz", "https://bitbucket.org/eigen/eigen/get/049af2f56331.tar.gz", ], ) @@ -181,7 +181,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): sha256 = "4c622a5c7b9feb9615d4723b03a13142a7f3f813f9296861d5401282b9fbea96", strip_prefix = "tools-0e906ebc527eab1cdbf7adabff5b474da9562e9f/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf", urls = [ - "http://mirror.tensorflow.org/github.com/raspberrypi/tools/archive/0e906ebc527eab1cdbf7adabff5b474da9562e9f.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/raspberrypi/tools/archive/0e906ebc527eab1cdbf7adabff5b474da9562e9f.tar.gz", "https://github.com/raspberrypi/tools/archive/0e906ebc527eab1cdbf7adabff5b474da9562e9f.tar.gz", ], ) @@ -192,7 +192,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): sha256 = "5fc1972471cd8e2b8b64ea017590193739fc88d9818e3d086621e5c08e86ea35", strip_prefix = "libxsmm-1.11", urls = [ - "http://mirror.tensorflow.org/github.com/hfp/libxsmm/archive/1.11.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/hfp/libxsmm/archive/1.11.tar.gz", "https://github.com/hfp/libxsmm/archive/1.11.tar.gz", ], ) @@ -203,7 +203,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): strip_prefix = "re2-506cfa4bffd060c06ec338ce50ea3468daa6c814", system_build_file = clean_dep("//third_party/systemlibs:re2.BUILD"), urls = [ - "http://mirror.tensorflow.org/github.com/google/re2/archive/506cfa4bffd060c06ec338ce50ea3468daa6c814.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/re2/archive/506cfa4bffd060c06ec338ce50ea3468daa6c814.tar.gz", "https://github.com/google/re2/archive/506cfa4bffd060c06ec338ce50ea3468daa6c814.tar.gz", ], ) @@ -217,7 +217,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): 
"//third_party/systemlibs:google_cloud_cpp.google.cloud.bigtable.BUILD": "google/cloud/bigtable/BUILD", }, urls = [ - "http://mirror.tensorflow.org/github.com/googleapis/google-cloud-cpp/archive/v0.10.0.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/googleapis/google-cloud-cpp/archive/v0.10.0.tar.gz", "https://github.com/googleapis/google-cloud-cpp/archive/v0.10.0.tar.gz", ], ) @@ -229,7 +229,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): strip_prefix = "googleapis-f81082ea1e2f85c43649bee26e0d9871d4b41cdb", system_build_file = clean_dep("//third_party/systemlibs:googleapis.BUILD"), urls = [ - "http://mirror.tensorflow.org/github.com/googleapis/googleapis/archive/f81082ea1e2f85c43649bee26e0d9871d4b41cdb.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/googleapis/googleapis/archive/f81082ea1e2f85c43649bee26e0d9871d4b41cdb.zip", "https://github.com/googleapis/googleapis/archive/f81082ea1e2f85c43649bee26e0d9871d4b41cdb.zip", ], ) @@ -239,7 +239,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): sha256 = "6678b484d929f2d0d3229d8ac4e3b815a950c86bb9f17851471d143f6d4f7834", strip_prefix = "gemmlowp-12fed0cd7cfcd9e169bf1925bc3a7a58725fdcc3", urls = [ - "http://mirror.tensorflow.org/github.com/google/gemmlowp/archive/12fed0cd7cfcd9e169bf1925bc3a7a58725fdcc3.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/gemmlowp/archive/12fed0cd7cfcd9e169bf1925bc3a7a58725fdcc3.zip", "https://github.com/google/gemmlowp/archive/12fed0cd7cfcd9e169bf1925bc3a7a58725fdcc3.zip", ], ) @@ -250,7 +250,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): sha256 = "6560547c63e4af82b0f202cb710ceabb3f21347a4b996db565a411da5b17aba0", strip_prefix = "farmhash-816a4ae622e964763ca0862d9dbd19324a1eaf45", urls = [ - "http://mirror.tensorflow.org/github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz", "https://github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz", ], ) @@ -263,7 +263,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): strip_prefix = "libpng-1.6.37", system_build_file = clean_dep("//third_party/systemlibs:png.BUILD"), urls = [ - "http://mirror.tensorflow.org/github.com/glennrp/libpng/archive/v1.6.37.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/glennrp/libpng/archive/v1.6.37.tar.gz", "https://github.com/glennrp/libpng/archive/v1.6.37.tar.gz", ], ) @@ -275,7 +275,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): strip_prefix = "sqlite-amalgamation-3280000", system_build_file = clean_dep("//third_party/systemlibs:sqlite.BUILD"), urls = [ - "http://mirror.tensorflow.org/www.sqlite.org/2019/sqlite-amalgamation-3280000.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/www.sqlite.org/2019/sqlite-amalgamation-3280000.zip", "https://www.sqlite.org/2019/sqlite-amalgamation-3280000.zip", ], ) @@ -287,7 +287,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): strip_prefix = "giflib-5.1.4", system_build_file = clean_dep("//third_party/systemlibs:gif.BUILD"), urls = [ - "http://mirror.tensorflow.org/ufpr.dl.sourceforge.net/project/giflib/giflib-5.1.4.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/ufpr.dl.sourceforge.net/project/giflib/giflib-5.1.4.tar.gz", 
"http://pilotfiber.dl.sourceforge.net/project/giflib/giflib-5.1.4.tar.gz", ], ) @@ -299,7 +299,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): strip_prefix = "six-1.10.0", system_build_file = clean_dep("//third_party/systemlibs:six.BUILD"), urls = [ - "http://mirror.tensorflow.org/pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz", "https://pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz", ], ) @@ -311,7 +311,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): strip_prefix = "astor-0.7.1", system_build_file = clean_dep("//third_party/systemlibs:astor.BUILD"), urls = [ - "http://mirror.tensorflow.org/pypi.python.org/packages/99/80/f9482277c919d28bebd85813c0a70117214149a96b08981b72b63240b84c/astor-0.7.1.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/pypi.python.org/packages/99/80/f9482277c919d28bebd85813c0a70117214149a96b08981b72b63240b84c/astor-0.7.1.tar.gz", "https://pypi.python.org/packages/99/80/f9482277c919d28bebd85813c0a70117214149a96b08981b72b63240b84c/astor-0.7.1.tar.gz", ], ) @@ -322,7 +322,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): sha256 = "f6253dfbe0538ad2e387bd8fdfd9293c925d63553f5813c4e587745416501e6d", strip_prefix = "functools32-3.2.3-2", urls = [ - "http://mirror.tensorflow.org/pypi.python.org/packages/c5/60/6ac26ad05857c601308d8fb9e87fa36d0ebf889423f47c3502ef034365db/functools32-3.2.3-2.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/pypi.python.org/packages/c5/60/6ac26ad05857c601308d8fb9e87fa36d0ebf889423f47c3502ef034365db/functools32-3.2.3-2.tar.gz", "https://pypi.python.org/packages/c5/60/6ac26ad05857c601308d8fb9e87fa36d0ebf889423f47c3502ef034365db/functools32-3.2.3-2.tar.gz", ], ) @@ -334,7 +334,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): strip_prefix = "gast-0.2.2", system_build_file = clean_dep("//third_party/systemlibs:gast.BUILD"), urls = [ - "http://mirror.tensorflow.org/pypi.python.org/packages/4e/35/11749bf99b2d4e3cceb4d55ca22590b0d7c2c62b9de38ac4a4a7f4687421/gast-0.2.2.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/pypi.python.org/packages/4e/35/11749bf99b2d4e3cceb4d55ca22590b0d7c2c62b9de38ac4a4a7f4687421/gast-0.2.2.tar.gz", "https://files.pythonhosted.org/packages/4e/35/11749bf99b2d4e3cceb4d55ca22590b0d7c2c62b9de38ac4a4a7f4687421/gast-0.2.2.tar.gz", ], ) @@ -346,7 +346,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): strip_prefix = "termcolor-1.1.0", system_build_file = clean_dep("//third_party/systemlibs:termcolor.BUILD"), urls = [ - "http://mirror.tensorflow.org/pypi.python.org/packages/8a/48/a76be51647d0eb9f10e2a4511bf3ffb8cc1e6b14e9e4fab46173aa79f981/termcolor-1.1.0.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/pypi.python.org/packages/8a/48/a76be51647d0eb9f10e2a4511bf3ffb8cc1e6b14e9e4fab46173aa79f981/termcolor-1.1.0.tar.gz", "https://pypi.python.org/packages/8a/48/a76be51647d0eb9f10e2a4511bf3ffb8cc1e6b14e9e4fab46173aa79f981/termcolor-1.1.0.tar.gz", ], ) @@ -358,7 +358,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): strip_prefix = "opt_einsum-2.3.2", system_build_file = clean_dep("//third_party/systemlibs:opt_einsum.BUILD"), urls = [ - "http://mirror.tensorflow.org/pypi.python.org/packages/f6/d6/44792ec668bcda7d91913c75237314e688f70415ab2acd7172c845f0b24f/opt_einsum-2.3.2.tar.gz", + 
"https://storage.googleapis.com/mirror.tensorflow.org/pypi.python.org/packages/f6/d6/44792ec668bcda7d91913c75237314e688f70415ab2acd7172c845f0b24f/opt_einsum-2.3.2.tar.gz", "https://pypi.python.org/packages/f6/d6/44792ec668bcda7d91913c75237314e688f70415ab2acd7172c845f0b24f/opt_einsum-2.3.2.tar.gz", ], ) @@ -374,7 +374,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): "//third_party/systemlibs:absl_py.absl.testing.BUILD": "absl/testing/BUILD", }, urls = [ - "http://mirror.tensorflow.org/github.com/abseil/abseil-py/archive/pypi-v0.7.1.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/abseil/abseil-py/archive/pypi-v0.7.1.tar.gz", "https://github.com/abseil/abseil-py/archive/pypi-v0.7.1.tar.gz", ], ) @@ -382,7 +382,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "enum34_archive", urls = [ - "http://mirror.tensorflow.org/pypi.python.org/packages/bf/3e/31d502c25302814a7c2f1d3959d2a3b3f78e509002ba91aea64993936876/enum34-1.1.6.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/pypi.python.org/packages/bf/3e/31d502c25302814a7c2f1d3959d2a3b3f78e509002ba91aea64993936876/enum34-1.1.6.tar.gz", "https://pypi.python.org/packages/bf/3e/31d502c25302814a7c2f1d3959d2a3b3f78e509002ba91aea64993936876/enum34-1.1.6.tar.gz", ], sha256 = "8ad8c4783bf61ded74527bffb48ed9b54166685e4230386a9ed9b1279e2df5b1", @@ -396,7 +396,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): sha256 = "8813bf712a66b3d8b85dc289e1104ed220f1878cf981e2fe756dfaabe9a82892", strip_prefix = "backports.weakref-1.0rc1/src", urls = [ - "http://mirror.tensorflow.org/pypi.python.org/packages/bc/cc/3cdb0a02e7e96f6c70bd971bc8a90b8463fda83e264fa9c5c1c98ceabd81/backports.weakref-1.0rc1.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/pypi.python.org/packages/bc/cc/3cdb0a02e7e96f6c70bd971bc8a90b8463fda83e264fa9c5c1c98ceabd81/backports.weakref-1.0rc1.tar.gz", "https://pypi.python.org/packages/bc/cc/3cdb0a02e7e96f6c70bd971bc8a90b8463fda83e264fa9c5c1c98ceabd81/backports.weakref-1.0rc1.tar.gz", ], ) @@ -406,7 +406,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): licenses = ["notice"], # Python 2.0 sha256_urls = { "e76cacdf0bdd265ff074ccca03671c33126f597f39d0ed97bc3e5673d9170cf6": [ - "http://mirror.tensorflow.org/docs.python.org/2.7/_sources/license.rst.txt", + "https://storage.googleapis.com/mirror.tensorflow.org/docs.python.org/2.7/_sources/license.rst.txt", "https://docs.python.org/2.7/_sources/license.rst.txt", ], }, @@ -414,7 +414,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): # 310ba5ee72661c081129eb878c1bbcec936b20f0 is based on 3.8.0 with a fix for protobuf.bzl. 
PROTOBUF_URLS = [ - "http://mirror.tensorflow.org/github.com/protocolbuffers/protobuf/archive/310ba5ee72661c081129eb878c1bbcec936b20f0.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/protocolbuffers/protobuf/archive/310ba5ee72661c081129eb878c1bbcec936b20f0.tar.gz", "https://github.com/protocolbuffers/protobuf/archive/310ba5ee72661c081129eb878c1bbcec936b20f0.tar.gz", ] PROTOBUF_SHA256 = "b9e92f9af8819bbbc514e2902aec860415b70209f31dfc8c4fa72515a5df9d59" @@ -442,7 +442,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): strip_prefix = "nsync-1.20.2", system_build_file = clean_dep("//third_party/systemlibs:nsync.BUILD"), urls = [ - "http://mirror.tensorflow.org/github.com/google/nsync/archive/1.20.2.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/nsync/archive/1.20.2.tar.gz", "https://github.com/google/nsync/archive/1.20.2.tar.gz", ], ) @@ -452,7 +452,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): sha256 = "ff7a82736e158c077e76188232eac77913a15dac0b22508c390ab3f88e6d6d86", strip_prefix = "googletest-b6cd405286ed8635ece71c72f118e659f4ade3fb", urls = [ - "http://mirror.tensorflow.org/github.com/google/googletest/archive/b6cd405286ed8635ece71c72f118e659f4ade3fb.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/googletest/archive/b6cd405286ed8635ece71c72f118e659f4ade3fb.zip", "https://github.com/google/googletest/archive/b6cd405286ed8635ece71c72f118e659f4ade3fb.zip", ], ) @@ -462,7 +462,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): sha256 = "ae27cdbcd6a2f935baa78e4f21f675649271634c092b1be01469440495609d0e", strip_prefix = "gflags-2.2.1", urls = [ - "http://mirror.tensorflow.org/github.com/gflags/gflags/archive/v2.2.1.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/gflags/gflags/archive/v2.2.1.tar.gz", "https://github.com/gflags/gflags/archive/v2.2.1.tar.gz", ], ) @@ -474,7 +474,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): strip_prefix = "pcre-8.42", system_build_file = clean_dep("//third_party/systemlibs:pcre.BUILD"), urls = [ - "http://mirror.tensorflow.org/ftp.exim.org/pub/pcre/pcre-8.42.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/ftp.exim.org/pub/pcre/pcre-8.42.tar.gz", "http://ftp.exim.org/pub/pcre/pcre-8.42.tar.gz", ], ) @@ -486,7 +486,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): strip_prefix = "swig-3.0.8", system_build_file = clean_dep("//third_party/systemlibs:swig.BUILD"), urls = [ - "http://mirror.tensorflow.org/ufpr.dl.sourceforge.net/project/swig/swig/swig-3.0.8/swig-3.0.8.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/ufpr.dl.sourceforge.net/project/swig/swig/swig-3.0.8/swig-3.0.8.tar.gz", "http://ufpr.dl.sourceforge.net/project/swig/swig/swig-3.0.8/swig-3.0.8.tar.gz", "http://pilotfiber.dl.sourceforge.net/project/swig/swig/swig-3.0.8/swig-3.0.8.tar.gz", ], @@ -499,7 +499,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): strip_prefix = "curl-7.60.0", system_build_file = clean_dep("//third_party/systemlibs:curl.BUILD"), urls = [ - "http://mirror.tensorflow.org/curl.haxx.se/download/curl-7.60.0.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/curl.haxx.se/download/curl-7.60.0.tar.gz", "https://curl.haxx.se/download/curl-7.60.0.tar.gz", ], ) @@ -511,7 +511,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): strip_prefix = "grpc-4566c2a29ebec0835643b972eb99f4306c4234a3", system_build_file = 
clean_dep("//third_party/systemlibs:grpc.BUILD"), urls = [ - "http://mirror.tensorflow.org/github.com/grpc/grpc/archive/4566c2a29ebec0835643b972eb99f4306c4234a3.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/grpc/grpc/archive/4566c2a29ebec0835643b972eb99f4306c4234a3.tar.gz", "https://github.com/grpc/grpc/archive/4566c2a29ebec0835643b972eb99f4306c4234a3.tar.gz", ], ) @@ -522,7 +522,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): build_file = "@grpc//third_party:nanopb.BUILD", strip_prefix = "nanopb-f8ac463766281625ad710900479130c7fcb4d63b", urls = [ - "http://mirror.tensorflow.org/github.com/nanopb/nanopb/archive/f8ac463766281625ad710900479130c7fcb4d63b.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/nanopb/nanopb/archive/f8ac463766281625ad710900479130c7fcb4d63b.tar.gz", "https://github.com/nanopb/nanopb/archive/f8ac463766281625ad710900479130c7fcb4d63b.tar.gz", ], ) @@ -533,7 +533,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): sha256 = "7f51f45887a3d31b4ce4fa5965210a5e64637ceac12720cfce7954d6a2e812f7", strip_prefix = "linenoise-c894b9e59f02203dbe4e2be657572cf88c4230c3", urls = [ - "http://mirror.tensorflow.org/github.com/antirez/linenoise/archive/c894b9e59f02203dbe4e2be657572cf88c4230c3.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/antirez/linenoise/archive/c894b9e59f02203dbe4e2be657572cf88c4230c3.tar.gz", "https://github.com/antirez/linenoise/archive/c894b9e59f02203dbe4e2be657572cf88c4230c3.tar.gz", ], ) @@ -558,7 +558,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): strip_prefix = "lmdb-LMDB_0.9.22/libraries/liblmdb", system_build_file = clean_dep("//third_party/systemlibs:lmdb.BUILD"), urls = [ - "http://mirror.tensorflow.org/github.com/LMDB/lmdb/archive/LMDB_0.9.22.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/LMDB/lmdb/archive/LMDB_0.9.22.tar.gz", "https://github.com/LMDB/lmdb/archive/LMDB_0.9.22.tar.gz", ], ) @@ -570,7 +570,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): strip_prefix = "jsoncpp-1.8.4", system_build_file = clean_dep("//third_party/systemlibs:jsoncpp.BUILD"), urls = [ - "http://mirror.tensorflow.org/github.com/open-source-parsers/jsoncpp/archive/1.8.4.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/open-source-parsers/jsoncpp/archive/1.8.4.tar.gz", "https://github.com/open-source-parsers/jsoncpp/archive/1.8.4.tar.gz", ], ) @@ -581,7 +581,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): strip_prefix = "boringssl-7f634429a04abc48e2eb041c81c5235816c96514", system_build_file = clean_dep("//third_party/systemlibs:boringssl.BUILD"), urls = [ - "http://mirror.tensorflow.org/github.com/google/boringssl/archive/7f634429a04abc48e2eb041c81c5235816c96514.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/boringssl/archive/7f634429a04abc48e2eb041c81c5235816c96514.tar.gz", "https://github.com/google/boringssl/archive/7f634429a04abc48e2eb041c81c5235816c96514.tar.gz", ], ) @@ -593,7 +593,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): strip_prefix = "zlib-1.2.11", system_build_file = clean_dep("//third_party/systemlibs:zlib.BUILD"), urls = [ - "http://mirror.tensorflow.org/zlib.net/zlib-1.2.11.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/zlib.net/zlib-1.2.11.tar.gz", "https://zlib.net/zlib-1.2.11.tar.gz", ], ) @@ -603,7 +603,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): build_file = 
clean_dep("//third_party/fft2d:fft2d.BUILD"), sha256 = "ada7e99087c4ed477bfdf11413f2ba8db8a840ba9bbf8ac94f4f3972e2a7cec9", urls = [ - "http://mirror.tensorflow.org/www.kurims.kyoto-u.ac.jp/~ooura/fft2d.tgz", + "https://storage.googleapis.com/mirror.tensorflow.org/www.kurims.kyoto-u.ac.jp/~ooura/fft2d.tgz", "http://www.kurims.kyoto-u.ac.jp/~ooura/fft2d.tgz", ], ) @@ -615,7 +615,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): strip_prefix = "snappy-1.1.7", system_build_file = clean_dep("//third_party/systemlibs:snappy.BUILD"), urls = [ - "http://mirror.tensorflow.org/github.com/google/snappy/archive/1.1.7.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/snappy/archive/1.1.7.tar.gz", "https://github.com/google/snappy/archive/1.1.7.tar.gz", ], ) @@ -627,7 +627,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): sha256 = "9a7633e224982e2b60fa6b397d895d20d6b7498e3e02f46f98a5a4e187c5a44c", strip_prefix = "nccl-0ceaec9cee96ae7658aa45686853286651f36384", urls = [ - "http://mirror.tensorflow.org/github.com/nvidia/nccl/archive/0ceaec9cee96ae7658aa45686853286651f36384.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/nvidia/nccl/archive/0ceaec9cee96ae7658aa45686853286651f36384.tar.gz", "https://github.com/nvidia/nccl/archive/0ceaec9cee96ae7658aa45686853286651f36384.tar.gz", ], ) @@ -639,7 +639,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): sha256 = "cc6ebbcd0a826eec1b8ce1f625ffe71b53ef3290f8192b6cae38412a958f4fd3", strip_prefix = "librdkafka-0.11.5", urls = [ - "http://mirror.tensorflow.org/github.com/edenhill/librdkafka/archive/v0.11.5.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/edenhill/librdkafka/archive/v0.11.5.tar.gz", "https://github.com/edenhill/librdkafka/archive/v0.11.5.tar.gz", ], ) @@ -648,7 +648,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): name = "junit", jar_sha256 = "59721f0805e223d84b90677887d9ff567dc534d7c502ca903c0c2b17f05c116a", jar_urls = [ - "http://mirror.tensorflow.org/repo1.maven.org/maven2/junit/junit/4.12/junit-4.12.jar", + "https://storage.googleapis.com/mirror.tensorflow.org/repo1.maven.org/maven2/junit/junit/4.12/junit-4.12.jar", "http://repo1.maven.org/maven2/junit/junit/4.12/junit-4.12.jar", "http://maven.ibiblio.org/maven2/junit/junit/4.12/junit-4.12.jar", ], @@ -661,7 +661,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): name = "org_hamcrest_core", jar_sha256 = "66fdef91e9739348df7a096aa384a5685f4e875584cce89386a7a47251c4d8e9", jar_urls = [ - "http://mirror.tensorflow.org/repo1.maven.org/maven2/org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar", + "https://storage.googleapis.com/mirror.tensorflow.org/repo1.maven.org/maven2/org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar", "http://repo1.maven.org/maven2/org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar", "http://maven.ibiblio.org/maven2/org/hamcrest/hamcrest-core/1.3/hamcrest-core-1.3.jar", ], @@ -673,7 +673,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): name = "com_google_testing_compile", jar_sha256 = "edc180fdcd9f740240da1a7a45673f46f59c5578d8cd3fbc912161f74b5aebb8", jar_urls = [ - "http://mirror.tensorflow.org/repo1.maven.org/maven2/com/google/testing/compile/compile-testing/0.11/compile-testing-0.11.jar", + "https://storage.googleapis.com/mirror.tensorflow.org/repo1.maven.org/maven2/com/google/testing/compile/compile-testing/0.11/compile-testing-0.11.jar", 
"http://repo1.maven.org/maven2/com/google/testing/compile/compile-testing/0.11/compile-testing-0.11.jar", ], licenses = ["notice"], # New BSD License @@ -685,7 +685,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): name = "com_google_truth", jar_sha256 = "032eddc69652b0a1f8d458f999b4a9534965c646b8b5de0eba48ee69407051df", jar_urls = [ - "http://mirror.tensorflow.org/repo1.maven.org/maven2/com/google/truth/truth/0.32/truth-0.32.jar", + "https://storage.googleapis.com/mirror.tensorflow.org/repo1.maven.org/maven2/com/google/truth/truth/0.32/truth-0.32.jar", "http://repo1.maven.org/maven2/com/google/truth/truth/0.32/truth-0.32.jar", ], licenses = ["notice"], # Apache 2.0 @@ -697,7 +697,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): name = "org_checkerframework_qual", jar_sha256 = "a17501717ef7c8dda4dba73ded50c0d7cde440fd721acfeacbf19786ceac1ed6", jar_urls = [ - "http://mirror.tensorflow.org/repo1.maven.org/maven2/org/checkerframework/checker-qual/2.4.0/checker-qual-2.4.0.jar", + "https://storage.googleapis.com/mirror.tensorflow.org/repo1.maven.org/maven2/org/checkerframework/checker-qual/2.4.0/checker-qual-2.4.0.jar", "http://repo1.maven.org/maven2/org/checkerframework/checker-qual/2.4.0/checker-qual-2.4.0.jar", ], licenses = ["notice"], # Apache 2.0 @@ -707,7 +707,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): name = "com_squareup_javapoet", jar_sha256 = "5bb5abdfe4366c15c0da3332c57d484e238bd48260d6f9d6acf2b08fdde1efea", jar_urls = [ - "http://mirror.tensorflow.org/repo1.maven.org/maven2/com/squareup/javapoet/1.9.0/javapoet-1.9.0.jar", + "https://storage.googleapis.com/mirror.tensorflow.org/repo1.maven.org/maven2/com/squareup/javapoet/1.9.0/javapoet-1.9.0.jar", "http://repo1.maven.org/maven2/com/squareup/javapoet/1.9.0/javapoet-1.9.0.jar", ], licenses = ["notice"], # Apache 2.0 @@ -719,7 +719,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): sha256 = "e0928ca4aa10ea1e0551e2d7ce4d1d7ea2d84b2abbdef082b0da84268791d0c4", strip_prefix = "pprof-c0fb62ec88c411cc91194465e54db2632845b650", urls = [ - "http://mirror.tensorflow.org/github.com/google/pprof/archive/c0fb62ec88c411cc91194465e54db2632845b650.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/pprof/archive/c0fb62ec88c411cc91194465e54db2632845b650.tar.gz", "https://github.com/google/pprof/archive/c0fb62ec88c411cc91194465e54db2632845b650.tar.gz", ], ) @@ -730,7 +730,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): sha256 = "6bfa06ab52a650ae7ee6963143a0bbc667d6504822cbd9670369b598f18c58c3", strip_prefix = "cub-1.8.0", urls = [ - "http://mirror.tensorflow.org/github.com/NVlabs/cub/archive/1.8.0.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/NVlabs/cub/archive/1.8.0.zip", "https://github.com/NVlabs/cub/archive/1.8.0.zip", ], ) @@ -754,7 +754,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): strip_prefix = "cython-0.28.4", system_build_file = clean_dep("//third_party/systemlibs:cython.BUILD"), urls = [ - "http://mirror.tensorflow.org/github.com/cython/cython/archive/0.28.4.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/cython/cython/archive/0.28.4.tar.gz", "https://github.com/cython/cython/archive/0.28.4.tar.gz", ], ) @@ -765,7 +765,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): sha256 = "213733991310b904b11b053ac224fee2d4e0179e46b52fe7f8735b8831e04dcc", strip_prefix = "ARM_NEON_2_x86_SSE-1200fe90bb174a6224a525ee60148671a786a71f", urls = [ - 
"http://mirror.tensorflow.org/github.com/intel/ARM_NEON_2_x86_SSE/archive/1200fe90bb174a6224a525ee60148671a786a71f.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/intel/ARM_NEON_2_x86_SSE/archive/1200fe90bb174a6224a525ee60148671a786a71f.tar.gz", "https://github.com/intel/ARM_NEON_2_x86_SSE/archive/1200fe90bb174a6224a525ee60148671a786a71f.tar.gz", ], ) @@ -777,7 +777,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): strip_prefix = "double-conversion-3992066a95b823efc8ccc1baf82a1cfc73f6e9b8", system_build_file = clean_dep("//third_party/systemlibs:double_conversion.BUILD"), urls = [ - "http://mirror.tensorflow.org/github.com/google/double-conversion/archive/3992066a95b823efc8ccc1baf82a1cfc73f6e9b8.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/double-conversion/archive/3992066a95b823efc8ccc1baf82a1cfc73f6e9b8.zip", "https://github.com/google/double-conversion/archive/3992066a95b823efc8ccc1baf82a1cfc73f6e9b8.zip", ], ) @@ -807,7 +807,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): build_file = str(Label("//third_party:tflite_mobilenet.BUILD")), sha256 = "767057f2837a46d97882734b03428e8dd640b93236052b312b2f0e45613c1cf0", urls = [ - "http://mirror.tensorflow.org/storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_ssd_tflite_v1.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_ssd_tflite_v1.zip", "https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_ssd_tflite_v1.zip", ], ) @@ -817,7 +817,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): build_file = str(Label("//third_party:tflite_mobilenet.BUILD")), sha256 = "a809cd290b4d6a2e8a9d5dad076e0bd695b8091974e0eed1052b480b2f21b6dc", urls = [ - "http://mirror.tensorflow.org/storage.googleapis.com/download.tensorflow.org/models/tflite/coco_ssd_mobilenet_v1_0.75_quant_2018_06_29.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/storage.googleapis.com/download.tensorflow.org/models/tflite/coco_ssd_mobilenet_v1_0.75_quant_2018_06_29.zip", "https://storage.googleapis.com/download.tensorflow.org/models/tflite/coco_ssd_mobilenet_v1_0.75_quant_2018_06_29.zip", ], ) @@ -828,7 +828,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): sha256 = "09280972c5777f1aa775ef67cb4ac5d5ed21970acd8535aeca62450ef14f0d79", strip_prefix = "ssd_mobilenet_v1_quantized_300x300_coco14_sync_2018_07_18", urls = [ - "http://mirror.tensorflow.org/storage.googleapis.com/download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_quantized_300x300_coco14_sync_2018_07_18.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/storage.googleapis.com/download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_quantized_300x300_coco14_sync_2018_07_18.tar.gz", "http://storage.googleapis.com/download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_quantized_300x300_coco14_sync_2018_07_18.tar.gz", ], ) @@ -838,7 +838,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): build_file = str(Label("//third_party:tflite_mobilenet.BUILD")), sha256 = "d947b38cba389b5e2d0bfc3ea6cc49c784e187b41a071387b3742d1acac7691e", urls = [ - "http://mirror.tensorflow.org/storage.googleapis.com/download.tensorflow.org/models/tflite/conv_actions_tflite.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/storage.googleapis.com/download.tensorflow.org/models/tflite/conv_actions_tflite.zip", 
"https://storage.googleapis.com/download.tensorflow.org/models/tflite/conv_actions_tflite.zip", ], ) @@ -848,7 +848,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): build_file = clean_dep("//third_party:tflite_smartreply.BUILD"), sha256 = "8980151b85a87a9c1a3bb1ed4748119e4a85abd3cb5744d83da4d4bd0fbeef7c", urls = [ - "http://mirror.tensorflow.org/storage.googleapis.com/download.tensorflow.org/models/tflite/smartreply_1.0_2017_11_01.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/storage.googleapis.com/download.tensorflow.org/models/tflite/smartreply_1.0_2017_11_01.zip", "https://storage.googleapis.com/download.tensorflow.org/models/tflite/smartreply_1.0_2017_11_01.zip", ], ) @@ -859,7 +859,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): sha256 = "033c941b7829b05ca55a124a26a6a0581b1ececc154a2153cafcfdb54f80dca2", strip_prefix = "ovic", urls = [ - "http://mirror.tensorflow.org/storage.googleapis.com/download.tensorflow.org/data/ovic_2019_04_30.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/storage.googleapis.com/download.tensorflow.org/data/ovic_2019_04_30.zip", "https://storage.googleapis.com/download.tensorflow.org/data/ovic_2019_04_30.zip", ], ) @@ -869,7 +869,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): sha256 = "cd06d15dd8bb59926e4d65f9003bfc20f9da4b2519985c27e190cddc8b7a7806", strip_prefix = "rules_android-0.1.1", urls = [ - "http://mirror.tensorflow.org/github.com/bazelbuild/rules_android/archive/v0.1.1.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/bazelbuild/rules_android/archive/v0.1.1.zip", "https://github.com/bazelbuild/rules_android/archive/v0.1.1.zip", ], ) @@ -880,7 +880,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): sha256 = "c3245012296f09f1418b78a8c2f17df5188b3bd0db620f7fd5fabe363320805a", strip_prefix = "tbb-2019_U1", urls = [ - "http://mirror.tensorflow.org/github.com/01org/tbb/archive/2019_U1.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/01org/tbb/archive/2019_U1.zip", "https://github.com/01org/tbb/archive/2019_U1.zip", ], ) @@ -891,7 +891,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): sha256 = "a1780f24a1381fc25e323b4b2d08b6ef5129f42e011305b2a34dcf43a48030d5", strip_prefix = "ngraph-0.11.0", urls = [ - "http://mirror.tensorflow.org/github.com/NervanaSystems/ngraph/archive/v0.11.0.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/NervanaSystems/ngraph/archive/v0.11.0.tar.gz", "https://github.com/NervanaSystems/ngraph/archive/v0.11.0.tar.gz", ], ) @@ -902,7 +902,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): sha256 = "c377963a95989270c943d522bfefe7b889ef5ed0e1e15d535fd6f6f16ed70732", strip_prefix = "json-3.4.0", urls = [ - "http://mirror.tensorflow.org/github.com/nlohmann/json/archive/v3.4.0.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/nlohmann/json/archive/v3.4.0.tar.gz", "https://github.com/nlohmann/json/archive/v3.4.0.tar.gz", ], ) @@ -913,7 +913,7 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): sha256 = "742a642d2c6622277df4c902b6830d616d0539cc8cd843d6cdb899bb99e66e36", strip_prefix = "ngraph-tf-0.9.0", urls = [ - "http://mirror.tensorflow.org/github.com/NervanaSystems/ngraph-tf/archive/v0.9.0.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/NervanaSystems/ngraph-tf/archive/v0.9.0.zip", "https://github.com/NervanaSystems/ngraph-tf/archive/v0.9.0.zip", ], ) @@ -936,7 +936,7 @@ def 
tf_repositories(path_prefix = "", tf_repo_name = ""): strip_prefix = "wrapt-1.11.1/src/wrapt", system_build_file = clean_dep("//third_party/systemlibs:wrapt.BUILD"), urls = [ - "http://mirror.tensorflow.org/github.com/GrahamDumpleton/wrapt/archive/1.11.1.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/GrahamDumpleton/wrapt/archive/1.11.1.tar.gz", "https://github.com/GrahamDumpleton/wrapt/archive/1.11.1.tar.gz", ], ) diff --git a/third_party/aws/workspace.bzl b/third_party/aws/workspace.bzl index 81c22dde537..f37699e34c5 100644 --- a/third_party/aws/workspace.bzl +++ b/third_party/aws/workspace.bzl @@ -9,7 +9,7 @@ def repo(): third_party_http_archive( name = "aws", urls = [ - "http://mirror.tensorflow.org/github.com/aws/aws-sdk-cpp/archive/1.5.8.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/aws/aws-sdk-cpp/archive/1.5.8.tar.gz", "https://github.com/aws/aws-sdk-cpp/archive/1.5.8.tar.gz", ], sha256 = "89905075fe50aa13e0337ff905c2e8c1ce9caf77a3504484a7cda39179120ffc", diff --git a/third_party/flatbuffers/workspace.bzl b/third_party/flatbuffers/workspace.bzl index 5a64d80d053..5bf25c51e12 100644 --- a/third_party/flatbuffers/workspace.bzl +++ b/third_party/flatbuffers/workspace.bzl @@ -8,7 +8,7 @@ def repo(): strip_prefix = "flatbuffers-1.11.0", sha256 = "3f4a286642094f45b1b77228656fbd7ea123964f19502f9ecfd29933fd23a50b", urls = [ - "http://mirror.tensorflow.org/github.com/google/flatbuffers/archive/v1.11.0.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/flatbuffers/archive/v1.11.0.tar.gz", "https://github.com/google/flatbuffers/archive/v1.11.0.tar.gz", ], build_file = "//third_party/flatbuffers:BUILD.bazel", diff --git a/third_party/highwayhash/workspace.bzl b/third_party/highwayhash/workspace.bzl index dbec1ffea82..1a698aef918 100644 --- a/third_party/highwayhash/workspace.bzl +++ b/third_party/highwayhash/workspace.bzl @@ -6,7 +6,7 @@ def repo(): third_party_http_archive( name = "highwayhash", urls = [ - "http://mirror.tensorflow.org/github.com/google/highwayhash/archive/fd3d9af80465e4383162e4a7c5e2f406e82dd968.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/highwayhash/archive/fd3d9af80465e4383162e4a7c5e2f406e82dd968.tar.gz", "https://github.com/google/highwayhash/archive/fd3d9af80465e4383162e4a7c5e2f406e82dd968.tar.gz", ], sha256 = "9c3e0e87d581feeb0c18d814d98f170ff23e62967a2bd6855847f0b2fe598a37", diff --git a/third_party/hwloc/workspace.bzl b/third_party/hwloc/workspace.bzl index 3c7373a451c..dc8e1579e9c 100644 --- a/third_party/hwloc/workspace.bzl +++ b/third_party/hwloc/workspace.bzl @@ -6,7 +6,7 @@ def repo(): third_party_http_archive( name = "hwloc", urls = [ - "http://mirror.tensorflow.org/download.open-mpi.org/release/hwloc/v2.0/hwloc-2.0.3.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/download.open-mpi.org/release/hwloc/v2.0/hwloc-2.0.3.tar.gz", "https://download.open-mpi.org/release/hwloc/v2.0/hwloc-2.0.3.tar.gz", ], sha256 = "64def246aaa5b3a6e411ce10932a22e2146c3031b735c8f94739534f06ad071c", diff --git a/third_party/icu/workspace.bzl b/third_party/icu/workspace.bzl index 9ea63563840..ddd309a3ee6 100644 --- a/third_party/icu/workspace.bzl +++ b/third_party/icu/workspace.bzl @@ -13,7 +13,7 @@ def repo(): strip_prefix = "icu-release-62-1", sha256 = "e15ffd84606323cbad5515bf9ecdf8061cc3bf80fb883b9e6aa162e485aa9761", urls = [ - "http://mirror.tensorflow.org/github.com/unicode-org/icu/archive/release-62-1.tar.gz", + 
"https://storage.googleapis.com/mirror.tensorflow.org/github.com/unicode-org/icu/archive/release-62-1.tar.gz", "https://github.com/unicode-org/icu/archive/release-62-1.tar.gz", ], build_file = "//third_party/icu:BUILD.bazel", diff --git a/third_party/jpeg/workspace.bzl b/third_party/jpeg/workspace.bzl index f11dfd15e23..831e954779d 100644 --- a/third_party/jpeg/workspace.bzl +++ b/third_party/jpeg/workspace.bzl @@ -6,7 +6,7 @@ def repo(): third_party_http_archive( name = "jpeg", urls = [ - "http://mirror.tensorflow.org/github.com/libjpeg-turbo/libjpeg-turbo/archive/2.0.0.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/libjpeg-turbo/libjpeg-turbo/archive/2.0.0.tar.gz", "https://github.com/libjpeg-turbo/libjpeg-turbo/archive/2.0.0.tar.gz", ], sha256 = "f892fff427ab3adffc289363eac26d197ce3ccacefe5f5822377348a8166069b", diff --git a/third_party/keras_applications_archive/workspace.bzl b/third_party/keras_applications_archive/workspace.bzl index 1530ed8099d..bd92f18a9f2 100644 --- a/third_party/keras_applications_archive/workspace.bzl +++ b/third_party/keras_applications_archive/workspace.bzl @@ -8,7 +8,7 @@ def repo(): strip_prefix = "keras-applications-1.0.8", sha256 = "7c37f9e9ef93efac9b4956301cb21ce46c474ce9da41fac9a46753bab6823dfc", urls = [ - "http://mirror.tensorflow.org/github.com/keras-team/keras-applications/archive/1.0.8.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/keras-team/keras-applications/archive/1.0.8.tar.gz", "https://github.com/keras-team/keras-applications/archive/1.0.8.tar.gz", ], build_file = "//third_party/keras_applications_archive:BUILD.bazel", diff --git a/third_party/kissfft/workspace.bzl b/third_party/kissfft/workspace.bzl index f3679c7d0cf..f8e28c92160 100644 --- a/third_party/kissfft/workspace.bzl +++ b/third_party/kissfft/workspace.bzl @@ -8,7 +8,7 @@ def repo(): strip_prefix = "kissfft-36dbc057604f00aacfc0288ddad57e3b21cfc1b8", sha256 = "42b7ef406d5aa2d57a7b3b56fc44e8ad3011581692458a69958a911071efdcf2", urls = [ - "http://mirror.tensorflow.org/github.com/mborgerding/kissfft/archive/36dbc057604f00aacfc0288ddad57e3b21cfc1b8.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/mborgerding/kissfft/archive/36dbc057604f00aacfc0288ddad57e3b21cfc1b8.tar.gz", "https://github.com/mborgerding/kissfft/archive/36dbc057604f00aacfc0288ddad57e3b21cfc1b8.tar.gz", ], build_file = "//third_party/kissfft:BUILD.bazel", diff --git a/third_party/mlir/mlir_configure.bzl b/third_party/mlir/mlir_configure.bzl index ad6037b3d3b..ade32db3da2 100644 --- a/third_party/mlir/mlir_configure.bzl +++ b/third_party/mlir/mlir_configure.bzl @@ -7,7 +7,7 @@ def _mlir_autoconf_impl(repository_ctx): """Implementation of the mlir_configure repository rule.""" repository_ctx.download_and_extract( [ - "http://mirror.tensorflow.org/github.com/tensorflow/mlir/archive/{}.zip".format(_MLIR_REV), + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/tensorflow/mlir/archive/{}.zip".format(_MLIR_REV), "https://github.com/tensorflow/mlir/archive/{}.zip".format(_MLIR_REV), ], sha256 = _MLIR_SHA256, diff --git a/third_party/nasm/workspace.bzl b/third_party/nasm/workspace.bzl index af8c7d4d42f..2f474f8e032 100644 --- a/third_party/nasm/workspace.bzl +++ b/third_party/nasm/workspace.bzl @@ -6,7 +6,7 @@ def repo(): third_party_http_archive( name = "nasm", urls = [ - "http://mirror.tensorflow.org/www.nasm.us/pub/nasm/releasebuilds/2.13.03/nasm-2.13.03.tar.bz2", + 
"https://storage.googleapis.com/mirror.tensorflow.org/www.nasm.us/pub/nasm/releasebuilds/2.13.03/nasm-2.13.03.tar.bz2", "http://pkgs.fedoraproject.org/repo/pkgs/nasm/nasm-2.13.03.tar.bz2/sha512/d7a6b4cee8dfd603d8d4c976e5287b5cc542fa0b466ff989b743276a6e28114e64289bf02a7819eca63142a5278aa6eed57773007e5f589e15768e6456a8919d/nasm-2.13.03.tar.bz2", "http://www.nasm.us/pub/nasm/releasebuilds/2.13.03/nasm-2.13.03.tar.bz2", ], diff --git a/third_party/ortools/workspace.bzl b/third_party/ortools/workspace.bzl index ca7d62dfb61..b6ebddf2548 100644 --- a/third_party/ortools/workspace.bzl +++ b/third_party/ortools/workspace.bzl @@ -6,7 +6,7 @@ def repo(): third_party_http_archive( name = "ortools_archive", urls = [ - "http://mirror.tensorflow.org/github.com/google/or-tools/archive/v6.7.2.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/or-tools/archive/v6.7.2.tar.gz", "https://github.com/google/or-tools/archive/v6.7.2.tar.gz", ], sha256 = "d025a95f78b5fc5eaa4da5f395f23d11c23cf7dbd5069f1f627f002de87b86b9", diff --git a/third_party/pasta/workspace.bzl b/third_party/pasta/workspace.bzl index 7cd30c3b927..faf55c06075 100644 --- a/third_party/pasta/workspace.bzl +++ b/third_party/pasta/workspace.bzl @@ -6,7 +6,7 @@ def repo(): third_party_http_archive( name = "pasta", urls = [ - "http://mirror.tensorflow.org/github.com/google/pasta/archive/v0.1.2.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/pasta/archive/v0.1.2.tar.gz", "https://github.com/google/pasta/archive/v0.1.2.tar.gz", ], strip_prefix = "pasta-0.1.2", diff --git a/third_party/toolchains/preconfig/generate/archives.bzl b/third_party/toolchains/preconfig/generate/archives.bzl index 7d1cbc719de..8d4dae584dd 100644 --- a/third_party/toolchains/preconfig/generate/archives.bzl +++ b/third_party/toolchains/preconfig/generate/archives.bzl @@ -6,7 +6,7 @@ def bazel_toolchains_archive(): sha256 = "88e818f9f03628eef609c8429c210ecf265ffe46c2af095f36c7ef8b1855fef5", strip_prefix = "bazel-toolchains-92dd8a7a518a2fb7ba992d47c8b38299fe0be825", urls = [ - "http://mirror.tensorflow.org/github.com/bazelbuild/bazel-toolchains/archive/92dd8a7a518a2fb7ba992d47c8b38299fe0be825.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/bazelbuild/bazel-toolchains/archive/92dd8a7a518a2fb7ba992d47c8b38299fe0be825.tar.gz", "https://github.com/bazelbuild/bazel-toolchains/archive/92dd8a7a518a2fb7ba992d47c8b38299fe0be825.tar.gz", ], ) From ec74517c27c557f4bb16217cfd03556964fd787d Mon Sep 17 00:00:00 2001 From: Paul Andrey Date: Wed, 24 Jul 2019 09:49:35 +0200 Subject: [PATCH 0448/3053] Attempt at fixing noise_test.py Some builds failed to run `from tensorflow.python import dtypes`. This commit replaces this import with `tensorflow.python.keras.backend.dtypes_module`, in hope that it will be compatible with the various versions of the API currently maintained. 
--- tensorflow/python/keras/layers/noise_test.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/keras/layers/noise_test.py b/tensorflow/python/keras/layers/noise_test.py index b860ff9ae55..016b21178ef 100644 --- a/tensorflow/python/keras/layers/noise_test.py +++ b/tensorflow/python/keras/layers/noise_test.py @@ -20,8 +20,8 @@ from __future__ import print_function import numpy as np -from tensorflow.python import dtypes from tensorflow.python import keras +from tensorflow.python.keras.backend import dtypes_module from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras import testing_utils from tensorflow.python.platform import test @@ -53,7 +53,7 @@ class NoiseLayersTest(keras_parameterized.TestCase): @staticmethod def _make_model(dtype, gtype): - assert dtype in (dtypes.float32, dtypes.float64) + assert dtype in (dtypes_module.float32, dtypes_module.float64) assert gtype in ('noise', 'dropout') model = keras.Sequential() model.add(keras.layers.Dense(8, input_shape=(32,), dtype=dtype)) @@ -74,16 +74,16 @@ class NoiseLayersTest(keras_parameterized.TestCase): model.train_on_batch(np.zeros((8, 32)), np.zeros((8, 8))) def test_noise_float32(self): - self._train_model(dtypes.float32, 'noise') + self._train_model(dtypes_module.float32, 'noise') def test_noise_float64(self): - self._train_model(dtypes.float64, 'noise') + self._train_model(dtypes_module.float64, 'noise') def test_dropout_float32(self): - self._train_model(dtypes.float32, 'dropout') + self._train_model(dtypes_module.float32, 'dropout') def test_dropout_float64(self): - self._train_model(dtypes.float64, 'dropout') + self._train_model(dtypes_module.float64, 'dropout') if __name__ == '__main__': From d21adc55ad4f7e56544bec0db3c755ecc18e98f3 Mon Sep 17 00:00:00 2001 From: Gunhan Gulsoy Date: Wed, 24 Jul 2019 00:50:18 -0700 Subject: [PATCH 0449/3053] Disable broken model_subclassing_test on windows PiperOrigin-RevId: 259690740 --- tensorflow/python/keras/BUILD | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD index e0d9c0a3872..b48d3c86e79 100755 --- a/tensorflow/python/keras/BUILD +++ b/tensorflow/python/keras/BUILD @@ -1569,7 +1569,10 @@ tf_py_test( "//tensorflow/python:client_testlib", ], shard_count = 4, - tags = ["notsan"], + tags = [ + "no_windows", + "notsan", + ], ) tf_py_test( From 93802f756739f8eed9c8d3d654be74a20467f2a9 Mon Sep 17 00:00:00 2001 From: Gunhan Gulsoy Date: Wed, 24 Jul 2019 01:00:11 -0700 Subject: [PATCH 0450/3053] Automated rollback of commit 2a4b5a3f239b667e2720e73b3048c9896659b0bb PiperOrigin-RevId: 259691812 --- tensorflow/core/BUILD | 32 ------ tensorflow/core/kernels/data/BUILD | 1 - .../kernels/data/unbounded_thread_pool.cc | 97 +++++++++++++--- .../core/kernels/data/unbounded_thread_pool.h | 36 ++++-- .../data/unbounded_thread_pool_test.cc | 62 ++++++++++- .../platform/default/unbounded_work_queue.cc | 101 ----------------- .../platform/default/unbounded_work_queue.h | 65 ----------- .../core/platform/unbounded_work_queue.h | 33 ------ .../platform/unbounded_work_queue_test.cc | 104 ------------------ 9 files changed, 174 insertions(+), 357 deletions(-) delete mode 100644 tensorflow/core/platform/default/unbounded_work_queue.cc delete mode 100644 tensorflow/core/platform/default/unbounded_work_queue.h delete mode 100644 tensorflow/core/platform/unbounded_work_queue.h delete mode 100644 tensorflow/core/platform/unbounded_work_queue_test.cc diff 
--git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index edd9e05b1af..89b9e2fb73f 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -626,38 +626,6 @@ filegroup( visibility = ["//visibility:private"], ) -cc_library( - name = "platform_unbounded_work_queue", - srcs = tf_platform_srcs([ - "unbounded_work_queue.cc", - ]) + tf_platform_hdrs([ - "unbounded_work_queue.h", - ]), - hdrs = ["platform/unbounded_work_queue.h"], - deps = [ - ":core_cpu_internal", - ":framework", - ":lib", - "@com_google_absl//absl/memory", - ], -) - -tf_cc_test( - name = "platform_unbounded_work_queue_test", - srcs = ["platform/unbounded_work_queue_test.cc"], - deps = [ - ":framework", - ":lib", - ":lib_internal", - ":lib_test_internal", - ":platform_unbounded_work_queue", - ":protos_all_cc", - ":test", - ":test_main", - "@com_google_absl//absl/memory", - ], -) - # Headers that are not exported as part of ":lib". filegroup( name = "platform_other_internal_hdrs", diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD index 8905641536e..a5f41b6dcae 100644 --- a/tensorflow/core/kernels/data/BUILD +++ b/tensorflow/core/kernels/data/BUILD @@ -180,7 +180,6 @@ cc_library( "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", "//tensorflow/core:lib", - "//tensorflow/core:platform_unbounded_work_queue", "@com_google_absl//absl/memory", ], ) diff --git a/tensorflow/core/kernels/data/unbounded_thread_pool.cc b/tensorflow/core/kernels/data/unbounded_thread_pool.cc index 9bb8f4e92e6..ac12197f1b8 100644 --- a/tensorflow/core/kernels/data/unbounded_thread_pool.cc +++ b/tensorflow/core/kernels/data/unbounded_thread_pool.cc @@ -16,9 +16,8 @@ limitations under the License. #include "tensorflow/core/kernels/data/unbounded_thread_pool.h" #include "absl/memory/memory.h" -#include "tensorflow/core/lib/core/notification.h" #include "tensorflow/core/platform/env.h" -#include "tensorflow/core/platform/unbounded_work_queue.h" +#include "tensorflow/core/platform/mutex.h" namespace tensorflow { namespace data { @@ -31,7 +30,7 @@ class UnboundedThreadPool::LogicalThreadFactory : public ThreadFactory { std::unique_ptr StartThread(const string& name, std::function fn) override { - return pool_->ScheduleOnWorkQueue(std::move(fn)); + return pool_->RunOnPooledThread(std::move(fn)); } private: @@ -53,7 +52,8 @@ class UnboundedThreadPool::LogicalThreadWrapper : public Thread { // NOTE: The `Thread` destructor is expected to "join" the created thread, // but the physical thread may continue to execute after the work for this // thread is complete. We simulate this by waiting on a notification that - // the thread's work function will notify when it is complete. + // the `CachedThreadFunc` will notify when the thread's work function is + // complete. join_notification_->WaitForNotification(); } @@ -61,25 +61,96 @@ class UnboundedThreadPool::LogicalThreadWrapper : public Thread { std::shared_ptr join_notification_; }; +UnboundedThreadPool::~UnboundedThreadPool() { + { + mutex_lock l(work_queue_mu_); + // Wake up all `CachedThreadFunc` threads and cause them to terminate before + // joining them when `threads_` is cleared. + cancelled_ = true; + work_queue_cv_.notify_all(); + if (!work_queue_.empty()) { + LOG(ERROR) << "UnboundedThreadPool named \"" << thread_name_ << "\" was " + << "deleted with pending work in its queue. 
This may indicate " + << "a potential use-after-free bug."; + } + } + + { + mutex_lock l(thread_pool_mu_); + // Clear the list of pooled threads, which will eventually terminate due to + // the previous notification. + // + // NOTE: It is safe to do this while holding `pooled_threads_mu_`, because + // no subsequent calls to `this->StartThread()` should be issued after the + // destructor starts. + thread_pool_.clear(); + } +} + std::shared_ptr UnboundedThreadPool::get_thread_factory() { return std::make_shared(this); } -namespace { -void WorkQueueFunc(const std::function& fn, - std::shared_ptr notification) { - fn(); - notification->Notify(); +size_t UnboundedThreadPool::size() { + tf_shared_lock l(thread_pool_mu_); + return thread_pool_.size(); } -} // namespace -std::unique_ptr UnboundedThreadPool::ScheduleOnWorkQueue( +std::unique_ptr UnboundedThreadPool::RunOnPooledThread( std::function fn) { auto join_notification = std::make_shared(); - unbounded_work_queue_.Schedule( - std::bind(&WorkQueueFunc, std::move(fn), join_notification)); + bool all_threads_busy; + { + // Enqueue a work item for the new thread's function, and wake up a + // cached thread to process it. + mutex_lock l(work_queue_mu_); + work_queue_.push_back({std::move(fn), join_notification}); + work_queue_cv_.notify_one(); + // NOTE: The queue may be non-empty, so we must account for queued work when + // considering how many threads are free. + all_threads_busy = work_queue_.size() > num_idle_threads_; + } + + if (all_threads_busy) { + // Spawn a new physical thread to process the given function. + // NOTE: `PooledThreadFunc` will eventually increment `num_idle_threads_` + // at the beginning of its work loop. + Thread* new_thread = env_->StartThread( + {}, thread_name_, + std::bind(&UnboundedThreadPool::PooledThreadFunc, this)); + + mutex_lock l(thread_pool_mu_); + thread_pool_.emplace_back(new_thread); + } + return absl::make_unique(std::move(join_notification)); } +void UnboundedThreadPool::PooledThreadFunc() { + while (true) { + WorkItem work_item; + { + mutex_lock l(work_queue_mu_); + ++num_idle_threads_; + while (!cancelled_ && work_queue_.empty()) { + // Wait for a new work function to be submitted, or the cache to be + // destroyed. + work_queue_cv_.wait(l); + } + if (cancelled_) { + return; + } + work_item = std::move(work_queue_.front()); + work_queue_.pop_front(); + --num_idle_threads_; + } + + work_item.work_function(); + + // Notify any thread that has "joined" the cached thread for this work item. + work_item.done_notification->Notify(); + } +} + } // namespace data } // namespace tensorflow diff --git a/tensorflow/core/kernels/data/unbounded_thread_pool.h b/tensorflow/core/kernels/data/unbounded_thread_pool.h index 90a54b9b19f..c84d495b296 100644 --- a/tensorflow/core/kernels/data/unbounded_thread_pool.h +++ b/tensorflow/core/kernels/data/unbounded_thread_pool.h @@ -20,33 +20,55 @@ limitations under the License. #include #include "tensorflow/core/framework/thread_factory.h" +#include "tensorflow/core/lib/core/notification.h" #include "tensorflow/core/platform/env.h" -#include "tensorflow/core/platform/unbounded_work_queue.h" +#include "tensorflow/core/platform/mutex.h" namespace tensorflow { namespace data { // An `UnboundedThreadPool` provides a mechanism for temporally multiplexing a // potentially large number of "logical" threads onto a smaller number of -// "physical" threads. The multiplexing is achieved by using an -// `UnboundedWorkQueue`. +// "physical" threads. 
The multiplexing is achieved by maintaining an internal +// pool of long-running "physical" threads that are used to execute the +// "logical" threads. Like a regular thread, a "logical" thread may block on +// other threads, and the size of the pool will increase to ensure that progress +// is made. This mechanism is recommended in situations where short-lived +// threads are created repeatedly, to avoid the overhead and memory +// fragmentation that can result from excessive thread creation. class UnboundedThreadPool { public: UnboundedThreadPool(Env* env, const string& thread_name) - : unbounded_work_queue_(env, thread_name) {} - ~UnboundedThreadPool() = default; + : env_(env), thread_name_(thread_name) {} + ~UnboundedThreadPool(); // Returns an implementation of `ThreadFactory` that can be used to create // logical threads in this pool. std::shared_ptr get_thread_factory(); + // Returns the current number of threads in this pool. + size_t size(); + private: class LogicalThreadFactory; class LogicalThreadWrapper; + struct WorkItem { + std::function work_function; + std::shared_ptr done_notification; + }; - std::unique_ptr ScheduleOnWorkQueue(std::function fn); + std::unique_ptr RunOnPooledThread(std::function fn); + void PooledThreadFunc(); - UnboundedWorkQueue unbounded_work_queue_; + Env* const env_; // Not owned. + const string thread_name_; + mutex work_queue_mu_; + condition_variable work_queue_cv_ GUARDED_BY(work_queue_mu_); + size_t num_idle_threads_ GUARDED_BY(work_queue_mu_) = 0; + bool cancelled_ GUARDED_BY(work_queue_mu_) = false; + std::deque work_queue_ GUARDED_BY(work_queue_mu_); + mutex thread_pool_mu_; + std::vector> thread_pool_ GUARDED_BY(thread_pool_mu_); }; } // namespace data diff --git a/tensorflow/core/kernels/data/unbounded_thread_pool_test.cc b/tensorflow/core/kernels/data/unbounded_thread_pool_test.cc index 3604be86473..f996b4f931b 100644 --- a/tensorflow/core/kernels/data/unbounded_thread_pool_test.cc +++ b/tensorflow/core/kernels/data/unbounded_thread_pool_test.cc @@ -23,6 +23,59 @@ namespace tensorflow { namespace data { namespace { +TEST(UnboundedThreadPool, SingleThread) { + UnboundedThreadPool pool(Env::Default(), "test"); + auto thread_factory = pool.get_thread_factory(); + + // Create a thread that updates a variable, and ensure that it runs to + // completion. + std::atomic i(0); + auto thread = thread_factory->StartThread("", [&i]() { ++i; }); + thread.reset(); + + EXPECT_GE(pool.size(), 1); + EXPECT_EQ(1, i); +} + +TEST(UnboundedThreadPool, MultipleThreads) { + UnboundedThreadPool pool(Env::Default(), "test"); + auto thread_factory = pool.get_thread_factory(); + + // Create ten threads that update a variable, and ensure that they all run + // to completion. + std::vector> threads; + const int kNumThreadsToCreate = 10; + std::atomic i(0); + for (int j = 0; j < kNumThreadsToCreate; ++j) { + threads.push_back(thread_factory->StartThread("", [&i]() { ++i; })); + } + threads.clear(); + + EXPECT_GE(pool.size(), 1); + EXPECT_EQ(i, kNumThreadsToCreate); +} + +TEST(UnboundedThreadPool, MultipleThreadsSleepingRandomly) { + UnboundedThreadPool pool(Env::Default(), "test"); + auto thread_factory = pool.get_thread_factory(); + + // Create 1000 threads that sleep for a random period of time then update a + // variable, and ensure that they all run to completion. 
+ std::vector> threads; + const int kNumThreadsToCreate = 1000; + std::atomic i(0); + for (int j = 0; j < kNumThreadsToCreate; ++j) { + threads.push_back(thread_factory->StartThread("", [&i]() { + Env::Default()->SleepForMicroseconds(random::New64() % 10); + ++i; + })); + } + threads.clear(); + + EXPECT_GE(pool.size(), 1); + EXPECT_EQ(i, kNumThreadsToCreate); +} + TEST(UnboundedThreadPool, ConcurrentThreadCreation) { UnboundedThreadPool pool(Env::Default(), "test"); auto thread_factory = pool.get_thread_factory(); @@ -44,6 +97,7 @@ TEST(UnboundedThreadPool, ConcurrentThreadCreation) { } threads.clear(); + EXPECT_GE(pool.size(), 1); EXPECT_EQ(i, kNumThreadsToCreate * kNumThreadsToCreate); } @@ -54,7 +108,9 @@ TEST(UnboundedThreadPool, MultipleBlockingThreads) { std::vector> threads; // Create multiple waves (with increasing sizes) of threads that all block - // before returning, and ensure that we terminate correctly. + // before returning, and + // ensure that we create the appropriate number of threads and terminate + // correctly. std::vector round_sizes = {5, 10, 15, 20}; for (const int round_size : round_sizes) { @@ -73,6 +129,10 @@ TEST(UnboundedThreadPool, MultipleBlockingThreads) { // wave is increasing, we should have at least that number of threads in the // pool. bc.Wait(); + // NOTE: There is a benign race between a new round starting and the + // physical threads from the previous round returning to the pool, so we may + // create more threads than the round_size. + EXPECT_GE(pool.size(), round_size); n.Notify(); threads.clear(); } diff --git a/tensorflow/core/platform/default/unbounded_work_queue.cc b/tensorflow/core/platform/default/unbounded_work_queue.cc deleted file mode 100644 index 249d6358643..00000000000 --- a/tensorflow/core/platform/default/unbounded_work_queue.cc +++ /dev/null @@ -1,101 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/core/platform/unbounded_work_queue.h" - -#include "absl/memory/memory.h" -#include "tensorflow/core/platform/env.h" -#include "tensorflow/core/platform/mutex.h" - -namespace tensorflow { - -UnboundedWorkQueue::UnboundedWorkQueue(Env* env, const string& thread_name) - : env_(env), thread_name_(thread_name) {} - -UnboundedWorkQueue::~UnboundedWorkQueue() { - { - mutex_lock l(work_queue_mu_); - // Wake up all `PooledThreadFunc` threads and cause them to terminate before - // joining them when `threads_` is cleared. - cancelled_ = true; - work_queue_cv_.notify_all(); - if (!work_queue_.empty()) { - LOG(ERROR) << "UnboundedWorkQueue named \"" << thread_name_ << "\" was " - << "deleted with pending work in its queue. This may indicate " - << "a potential use-after-free bug."; - } - } - - { - mutex_lock l(thread_pool_mu_); - // Clear the list of pooled threads, which will eventually terminate due to - // the previous notification. 
- // - // NOTE: It is safe to do this while holding `pooled_threads_mu_`, because - // no subsequent calls to `this->StartThread()` should be issued after the - // destructor starts. - thread_pool_.clear(); - } -} - -void UnboundedWorkQueue::Schedule(WorkFunction fn) { - bool all_threads_busy; - { - // Enqueue a work item for the new thread's function, and wake up a - // cached thread to process it. - mutex_lock l(work_queue_mu_); - work_queue_.push_back(std::move(fn)); - work_queue_cv_.notify_one(); - // NOTE: The queue may be non-empty, so we must account for queued work when - // considering how many threads are free. - all_threads_busy = work_queue_.size() > num_idle_threads_; - } - - if (all_threads_busy) { - // Spawn a new physical thread to process the given function. - // NOTE: `PooledThreadFunc` will eventually increment `num_idle_threads_` - // at the beginning of its work loop. - Thread* new_thread = - env_->StartThread({}, thread_name_, [this]() { PooledThreadFunc(); }); - - mutex_lock l(thread_pool_mu_); - thread_pool_.emplace_back(new_thread); - } -} - -void UnboundedWorkQueue::PooledThreadFunc() { - while (true) { - WorkFunction fn; - { - mutex_lock l(work_queue_mu_); - ++num_idle_threads_; - while (!cancelled_ && work_queue_.empty()) { - // Wait for a new work function to be submitted, or the cache to be - // destroyed. - work_queue_cv_.wait(l); - } - if (cancelled_) { - return; - } - fn = std::move(work_queue_.front()); - work_queue_.pop_front(); - --num_idle_threads_; - } - - fn(); - } -} - -} // namespace tensorflow diff --git a/tensorflow/core/platform/default/unbounded_work_queue.h b/tensorflow/core/platform/default/unbounded_work_queue.h deleted file mode 100644 index cba83622a3a..00000000000 --- a/tensorflow/core/platform/default/unbounded_work_queue.h +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#ifndef TENSORFLOW_CORE_PLATFORM_DEFAULT_UNBOUNDED_WORK_QUEUE_H_ -#define TENSORFLOW_CORE_PLATFORM_DEFAULT_UNBOUNDED_WORK_QUEUE_H_ - -#include -#include -#include - -#include "tensorflow/core/lib/core/notification.h" -#include "tensorflow/core/platform/env.h" -#include "tensorflow/core/platform/mutex.h" - -namespace tensorflow { - -// An `UnboundedWorkQueue` provides a mechanism for temporally multiplexing a -// potentially large number of "logical" threads onto a smaller number of -// "physical" threads. The multiplexing is achieved by maintaining an internal -// pool of long-running "physical" threads that are used to execute the -// "logical" threads. Like a regular thread, a "logical" thread may block on -// other threads, and the size of the pool will increase to ensure that progress -// is made. This mechanism is recommended in situations where short-lived -// threads are created repeatedly, to avoid the overhead and memory -// fragmentation that can result from excessive thread creation. 
-class UnboundedWorkQueue { - public: - UnboundedWorkQueue(Env* env, const string& thread_name); - ~UnboundedWorkQueue(); - - using WorkFunction = std::function; - - // Schedule `fn` on a thread. `fn` may perform blocking work, so if all the - // existing threads are blocked or busy, this may spawn a new thread which - // will be added to the thread pool managed by this work queue. - void Schedule(WorkFunction fn); - - private: - void PooledThreadFunc(); - - Env* const env_; // Not owned. - const string thread_name_; - mutex work_queue_mu_; - condition_variable work_queue_cv_ GUARDED_BY(work_queue_mu_); - size_t num_idle_threads_ GUARDED_BY(work_queue_mu_) = 0; - bool cancelled_ GUARDED_BY(work_queue_mu_) = false; - std::deque work_queue_ GUARDED_BY(work_queue_mu_); - mutex thread_pool_mu_; - std::vector> thread_pool_ GUARDED_BY(thread_pool_mu_); -}; - -} // namespace tensorflow - -#endif // TENSORFLOW_CORE_PLATFORM_DEFAULT_UNBOUNDED_WORK_QUEUE_H_ diff --git a/tensorflow/core/platform/unbounded_work_queue.h b/tensorflow/core/platform/unbounded_work_queue.h deleted file mode 100644 index 242980dafa9..00000000000 --- a/tensorflow/core/platform/unbounded_work_queue.h +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_CORE_PLATFORM_UNBOUNDED_WORK_QUEUE_H_ -#define TENSORFLOW_CORE_PLATFORM_UNBOUNDED_WORK_QUEUE_H_ - -#include "tensorflow/core/platform/platform.h" - -// An `UnboundedWorkQueue` feeds potentially-blocking work into a thread-pool -// whose size automatically increases with demand. - -#if defined(PLATFORM_GOOGLE) -#include "tensorflow/core/platform/google/unbounded_work_queue.h" -#elif defined(PLATFORM_POSIX) || defined(PLATFORM_POSIX_ANDROID) || \ - defined(PLATFORM_GOOGLE_ANDROID) || defined(PLATFORM_WINDOWS) -#include "tensorflow/core/platform/default/unbounded_work_queue.h" -#else -#error Define the appropriate PLATFORM_ macro for this platform -#endif - -#endif // TENSORFLOW_CORE_PLATFORM_UNBOUNDED_WORK_QUEUE_H_ diff --git a/tensorflow/core/platform/unbounded_work_queue_test.cc b/tensorflow/core/platform/unbounded_work_queue_test.cc deleted file mode 100644 index 03d91cd4893..00000000000 --- a/tensorflow/core/platform/unbounded_work_queue_test.cc +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/core/platform/unbounded_work_queue.h" - -#include "absl/memory/memory.h" -#include "tensorflow/core/lib/core/blocking_counter.h" -#include "tensorflow/core/lib/random/random.h" -#include "tensorflow/core/platform/test.h" - -namespace tensorflow { -namespace { - -class UnboundedWorkQueueTest : public ::testing::Test { - protected: - UnboundedWorkQueueTest() - : work_queue_( - absl::make_unique(Env::Default(), "test")) {} - ~UnboundedWorkQueueTest() override = default; - - void RunMultipleCopiesOfClosure(const int num_closures, - std::function fn) { - for (int i = 0; i < num_closures; ++i) { - work_queue_->Schedule([this, fn]() { - fn(); - mutex_lock l(mu_); - ++closure_count_; - cond_var_.notify_all(); - }); - } - } - - void BlockUntilClosuresDone(const int num_closures) { - mutex_lock l(mu_); - while (closure_count_ < num_closures) { - cond_var_.wait(l); - } - } - - void ResetQueue() { work_queue_.reset(); } - - int NumClosuresExecuted() { - mutex_lock l(mu_); - return closure_count_; - } - - private: - mutex mu_; - int closure_count_ GUARDED_BY(mu_) = 0; - condition_variable cond_var_; - std::unique_ptr work_queue_; -}; - -TEST_F(UnboundedWorkQueueTest, SingleClosure) { - constexpr int num_closures = 1; - RunMultipleCopiesOfClosure(num_closures, []() {}); - BlockUntilClosuresDone(num_closures); -} - -TEST_F(UnboundedWorkQueueTest, MultipleClosures) { - constexpr int num_closures = 10; - RunMultipleCopiesOfClosure(num_closures, []() {}); - BlockUntilClosuresDone(num_closures); -} - -TEST_F(UnboundedWorkQueueTest, MultipleClosuresSleepingRandomly) { - constexpr int num_closures = 1000; - RunMultipleCopiesOfClosure(num_closures, []() { - Env::Default()->SleepForMicroseconds(random::New64() % 10); - }); - BlockUntilClosuresDone(num_closures); -} - -TEST_F(UnboundedWorkQueueTest, NestedClosures) { - constexpr int num_closures = 10; - // Run `num_closures` closures, each of which runs `num_closures` closures. - RunMultipleCopiesOfClosure(num_closures, [this]() { - RunMultipleCopiesOfClosure(num_closures, []() {}); - }); - BlockUntilClosuresDone(num_closures * num_closures + num_closures); -} - -TEST_F(UnboundedWorkQueueTest, RacyDestructor) { - constexpr int num_closures = 100; - // Run `num_closures` closures, then delete `work_queue_`. - RunMultipleCopiesOfClosure(num_closures, []() {}); - ResetQueue(); - EXPECT_LE(NumClosuresExecuted(), num_closures); -} - -} // namespace -} // namespace tensorflow From cac04f544111a456a3123d90df8ada8f250bde79 Mon Sep 17 00:00:00 2001 From: Vojtech Bardiovsky Date: Wed, 24 Jul 2019 01:42:08 -0700 Subject: [PATCH 0451/3053] Set use_node_name_sharing to True for hash tables. PiperOrigin-RevId: 259697137 --- .../saved_model/function_deserialization.py | 9 +++++++++ .../python/saved_model/load_v1_in_v2_test.py | 16 ++++++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/saved_model/function_deserialization.py b/tensorflow/python/saved_model/function_deserialization.py index 599759a0c84..97d9989cde0 100644 --- a/tensorflow/python/saved_model/function_deserialization.py +++ b/tensorflow/python/saved_model/function_deserialization.py @@ -375,6 +375,15 @@ def fix_node_def(node_def, functions, shared_name_suffix, debug_name): if attr_value.func.name: attr_value.func.name = functions[attr_value.func.name].name + # Fix old table creation bug. 
+ if node_def.op == "HashTableV2": + if ("use_node_name_sharing" not in node_def.attr or + not node_def.attr["use_node_name_sharing"].b): + node_def.attr["use_node_name_sharing"].b = True + # We are turning on node mame sharing, so have to make sure we don't + # accidentally share a table resource. + shared_name_suffix += "_{}".format(ops.uid()) + # TODO(b/124205571): Avoid accidental sharing and destruction of restored # resources. For now uniquify "shared_name" when loading functions to avoid # sharing. diff --git a/tensorflow/python/saved_model/load_v1_in_v2_test.py b/tensorflow/python/saved_model/load_v1_in_v2_test.py index 3e61b441d94..906b8198335 100644 --- a/tensorflow/python/saved_model/load_v1_in_v2_test.py +++ b/tensorflow/python/saved_model/load_v1_in_v2_test.py @@ -197,7 +197,7 @@ class LoadTest(test.TestCase): self.evaluate(second_imported.signatures["second_key"]( second_start=constant_op.constant(2.)))) - def _v1_asset_saved_model(self): + def _v1_asset_saved_model(self, clear_shared_name): export_graph = ops.Graph() vocab_path = os.path.join(self.get_temp_dir(), "vocab.txt") with open(vocab_path, "w") as f: @@ -214,6 +214,9 @@ class LoadTest(test.TestCase): start = array_ops.placeholder( shape=None, dtype=dtypes.string, name="in") output = table.lookup(start, name="out") + if clear_shared_name: + export_graph.get_operation_by_name("hash_table")._clear_attr( + "shared_name") with session_lib.Session() as session: session.run([table.initializer]) path = os.path.join(self.get_temp_dir(), "saved_model", str(ops.uid())) @@ -228,7 +231,7 @@ class LoadTest(test.TestCase): @test_util.run_in_graph_and_eager_modes def test_asset_loading(self): - first_path = self._v1_asset_saved_model() + first_path = self._v1_asset_saved_model(clear_shared_name=False) imported = load.load(first_path) self.evaluate(lookup_ops.tables_initializer()) fn = imported.signatures["serving_default"] @@ -256,6 +259,15 @@ class LoadTest(test.TestCase): self.assertAllClose({"output": [2, 0]}, fn(start=constant_op.constant(["gamma", "alpha"]))) + @test_util.run_in_graph_and_eager_modes + def test_node_name_sharing(self): + fourth_path = self._v1_asset_saved_model(clear_shared_name=True) + fourth_import = load.load(fourth_path) + self.evaluate(lookup_ops.tables_initializer()) + fn = fourth_import.signatures["serving_default"] + self.assertAllClose({"output": [2, 0]}, + fn(start=constant_op.constant(["gamma", "alpha"]))) + def _v1_cond_saved_model(self): export_graph = ops.Graph() with export_graph.as_default(): From e2af9187c22da62fa9aef447131f0ca6151e386d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 24 Jul 2019 02:02:18 -0700 Subject: [PATCH 0452/3053] compat: Update forward compatibility horizon to 2019-07-24 PiperOrigin-RevId: 259699507 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 493f7266b20..0c980024549 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -31,7 +31,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2019, 7, 23) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2019, 7, 24) _FORWARD_COMPATIBILITY_HORIZON_OVERRIDDEN = False _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" From 4cef4357cca4308a568f295f67d20338fdc3ae48 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 24 Jul 2019 02:02:20 -0700 Subject: [PATCH 0453/3053] Update GraphDef version to 106. PiperOrigin-RevId: 259699521 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 94d81942cb8..304eef492c6 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 105 // Updated: 2019/7/23 +#define TF_GRAPH_DEF_VERSION 106 // Updated: 2019/7/24 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From 8d730c0d817cf46e10a817689be82843cf6d975d Mon Sep 17 00:00:00 2001 From: leike666666 Date: Wed, 24 Jul 2019 19:31:19 +0800 Subject: [PATCH 0454/3053] Delete the parameter allow_soft_placement in function AssignDevice --- tensorflow/core/common_runtime/colocation_graph.cc | 4 ++-- tensorflow/core/common_runtime/colocation_graph.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/common_runtime/colocation_graph.cc b/tensorflow/core/common_runtime/colocation_graph.cc index 4fd40a103a0..ac54b8539ee 100644 --- a/tensorflow/core/common_runtime/colocation_graph.cc +++ b/tensorflow/core/common_runtime/colocation_graph.cc @@ -438,7 +438,7 @@ bool Member::MergeSupportedDevices( return true; } -Status Member::AssignDevice(const Node& node, bool allow_soft_placement) { +Status Member::AssignDevice(const Node& node) { if (node.assigned_device_name_index() == assigned_device_name_index_) { return Status::OK(); } @@ -914,7 +914,7 @@ Status ColocationGraph::LimitToAssignedDevice(const Node& node) { } int root = FindAndUpdateRoot(node.id()); Member& root_member = members_[root]; - return root_member.AssignDevice(node, allow_soft_placement_); + return root_member.AssignDevice(node); } void ColocationGraph::GetSoftDeviceCandidates( diff --git a/tensorflow/core/common_runtime/colocation_graph.h b/tensorflow/core/common_runtime/colocation_graph.h index 410b943a34e..1d71a90ad4f 100644 --- a/tensorflow/core/common_runtime/colocation_graph.h +++ b/tensorflow/core/common_runtime/colocation_graph.h @@ -80,7 +80,7 @@ class Member { // not update this. Else returns true and updates this. bool MergeSupportedDevices(const Member& other); - Status AssignDevice(const Node& node, bool allow_soft_placement); + Status AssignDevice(const Node& node); // Limit the possible devices of this (should be a root) to the device // specifications in `devices`. From 5fa3172056c49e234707437892eb3edf20d16855 Mon Sep 17 00:00:00 2001 From: Stephen McGroarty Date: Wed, 24 Jul 2019 12:24:50 +0100 Subject: [PATCH 0455/3053] Make ReplaceInstruction preserve the sharding info. Right now ReplaceInstruction preserves the metadata if the user hasn't specified any on the new instruction. If we don't do this for the sharding information as well optimizations will drop the sharding information. 
--- tensorflow/compiler/xla/service/hlo_computation.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc index 639e853ada7..b853a2fb530 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.cc +++ b/tensorflow/compiler/xla/service/hlo_computation.cc @@ -835,6 +835,14 @@ Status HloComputation::ReplaceInstruction(HloInstruction* old_instruction, if (new_instruction->metadata().op_name().empty()) { new_instruction->set_metadata(old_instruction->metadata()); } + + // Like the metadata above, if the user didn't specify any sharding + // information on the new instruction we should copy the old sharding + // information (if any). + if (!new_instruction->has_sharding()) { + new_instruction->set_sharding(old_instruction->sharding_ptr()); + } + TF_RETURN_IF_ERROR(old_instruction->ReplaceAllUsesWith(new_instruction)); return RemoveInstructionAndUnusedOperands(old_instruction); } From 219a6bac7228a85d65a6b8b7ce5a19291261bf92 Mon Sep 17 00:00:00 2001 From: captain-pool Date: Wed, 24 Jul 2019 17:47:30 +0530 Subject: [PATCH 0456/3053] Updated SavedModel loading for showing Functions --- tensorflow/python/tools/saved_model_cli.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py index 367670de411..78f75ed173b 100644 --- a/tensorflow/python/tools/saved_model_cli.py +++ b/tensorflow/python/tools/saved_model_cli.py @@ -172,10 +172,8 @@ def _show_defined_functions(saved_model_dir, indent=0): saved_model_dir: Directory containing the SavedModel to inspect. indent: How far (in increments of 2 spaces) to indent each line of output. """ - if context.executing_eagerly(): - # Disable eager execution to prevent loading of checkpoints - ops_lib.disable_eager_execution() - trackable_object = load.load(saved_model_dir) + with ops_lib.Graph().as_default(): + trackable_object = load.load(saved_model_dir) indent_str = ' ' * indent def in_print(s): From 8eae8b76659f0c62efb56836450ec6a69b324819 Mon Sep 17 00:00:00 2001 From: captain-pool Date: Wed, 24 Jul 2019 20:36:27 +0530 Subject: [PATCH 0457/3053] Minor Fixes --- tensorflow/python/tools/saved_model_cli.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py index 78f75ed173b..fc10c8dc9a5 100644 --- a/tensorflow/python/tools/saved_model_cli.py +++ b/tensorflow/python/tools/saved_model_cli.py @@ -36,7 +36,6 @@ from tensorflow.core.example import example_pb2 from tensorflow.core.framework import types_pb2 from tensorflow.python.client import session from tensorflow.python.debug.wrappers import local_cli_wrapper -from tensorflow.python.eager import context from tensorflow.python.framework import meta_graph as meta_graph_lib from tensorflow.python.framework import ops as ops_lib from tensorflow.python.framework import tensor_spec @@ -165,37 +164,32 @@ def _show_inputs_outputs( meta_graph_def.signature_def[signature_def_key].method_name) -def _show_defined_functions(saved_model_dir, indent=0): +def _show_defined_functions(saved_model_dir): """Prints the function definition of SavedModel2.0 located at saved_model_dir Args: saved_model_dir: Directory containing the SavedModel to inspect. - indent: How far (in increments of 2 spaces) to indent each line of output. 
""" with ops_lib.Graph().as_default(): trackable_object = load.load(saved_model_dir) - indent_str = ' ' * indent - - def in_print(s): - print(indent_str + s) print('Defined Functions:') functions = save._AugmentedGraphView( trackable_object).list_functions(trackable_object) for name, function in functions.items(): - in_print('Function Name: \'%s\'' % name) + print(' Function Name: \'%s\'' % name) for index, concrete_functions in enumerate( function._list_all_concrete_functions_for_serialization(), 1): args, kwargs = (concrete_functions.structured_input_signature) - in_print('Option #%d' % index) - in_print(' Callable with:') + print(' Option #%d' % index) + print(' Callable with:') _print_args(args, indent=3) if kwargs: _print_args(kwargs, "Named Argument", indent=3) def _print_args(arguments, argument_type="Argument", indent=0): - """Formats and prints the argument of the concrete functions defined in the model + """Formats and prints the argument of the concrete functions defined in the model. Args: arguments: Arguments of the concrete functions. @@ -204,7 +198,7 @@ def _print_args(arguments, argument_type="Argument", indent=0): """ indent_str = ' ' * indent - def quotes(value): + def _may_be_add_quotes(value): is_quotes = '\'' * isinstance(value, str) return is_quotes + value + is_quotes @@ -233,7 +227,7 @@ def _print_args(arguments, argument_type="Argument", indent=0): _print_args(element, indent + 1) in_print(' ]') else: - in_print(' \'%s\': %s' % (str(key), quotes(value)), end='') + in_print(' \'%s\': %s' % (str(key), _may_be_add_quotes(value)), end='') in_print(' }') else: in_print(' DType: %s' % type(element).__name__) @@ -305,7 +299,7 @@ def _show_all(saved_model_dir): print('\nsignature_def[\'' + signature_def_key + '\']:') _show_inputs_outputs(saved_model_dir, tag_set, signature_def_key, indent=1) - _show_defined_functions(saved_model_dir, indent=1) + _show_defined_functions(saved_model_dir) def get_meta_graph_def(saved_model_dir, tag_set): From 048a50464d24ac6d799373768bc6cd92e0d10819 Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Wed, 24 Jul 2019 08:34:35 -0700 Subject: [PATCH 0458/3053] Add Argmax op to TfLite MLIR converter, also fix Argmin missing options. PiperOrigin-RevId: 259746257 --- tensorflow/compiler/mlir/lite/ir/tfl_ops.td | 34 +++++++++++++++++ .../compiler/mlir/lite/tests/legalize-tf.mlir | 16 ++++++++ tensorflow/compiler/mlir/lite/tests/ops.mlir | 16 ++++++++ .../mlir/lite/transforms/legalize_patterns.td | 1 + .../mlir/tensorflow/ir/tf_generated_ops.td | 37 ++++++++++++++++++- 5 files changed, 102 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index 21f5ce1bf5b..127a86b86ae 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -430,6 +430,32 @@ def TFL_AveragePool2DOp: let customOption = "Pool2DOptions"; } +def TFL_ArgMaxOp : TFL_Op<"arg_max", [NoSideEffect]> { + let summary = "ArgMax operator"; + + let description = [{ + Returns the index with the largest value across dimensions of a tensor. + }]; + + let arguments = ( + // TODO: Add support for uint8. + ins TensorOf<[F32, I32, I8]>:$input, + TFL_I32OrI64Tensor:$dim + ); + + let results = (outs + TFL_I32OrI64Tensor:$output + ); + + let hasOptions = 1; + + DerivedTFLiteTypeAttr output_type = DerivedTFLiteTypeAttr<[{ + return getResult()->getType().cast().getElementType(). + cast().getWidth() > 32 ? 
tflite::TensorType_INT64 : + tflite::TensorType_INT32; + }]>; +} + def TFL_ArgMinOp : TFL_Op<"arg_min", [NoSideEffect]> { let summary = "ArgMin operator"; @@ -449,6 +475,14 @@ def TFL_ArgMinOp : TFL_Op<"arg_min", [NoSideEffect]> { let results = (outs TFL_I32OrI64Tensor:$output ); + + let hasOptions = 1; + + DerivedTFLiteTypeAttr output_type = DerivedTFLiteTypeAttr<[{ + return getResult()->getType().cast().getElementType(). + cast().getWidth() > 32 ? tflite::TensorType_INT64 : + tflite::TensorType_INT32; + }]>; } def TFL_CeilOp: TFL_Op<"ceil", [NoSideEffect, SameOperandsAndResultType]> { diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir index 616922ba8d3..539cf8fffa6 100644 --- a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir @@ -941,3 +941,19 @@ func @OneHot(%arg0: tensor<3xi32>, %arg1: tensor, %arg2: tensor, %arg3 // CHECK-LABEL: OneHot // CHECK: "tfl.one_hot"(%arg0, %arg1, %arg2, %arg3) {axis = -1 : i32} : (tensor<3xi32>, tensor, tensor, tensor) -> tensor<*xf32> } + +func @argmax(%arg0: tensor<3xi32>, %arg1: tensor) -> tensor { + %0 = "tf.ArgMax"(%arg0, %arg1) : (tensor<3xi32>, tensor) -> tensor + return %0 : tensor + +// CHECK-LABEL: argmax +// CHECK: %0 = "tfl.arg_max"(%arg0, %arg1) : (tensor<3xi32>, tensor) -> tensor +} + +func @argmax64(%arg0: tensor<3xi32>, %arg1: tensor) -> tensor { + %0 = "tf.ArgMax"(%arg0, %arg1) : (tensor<3xi32>, tensor) -> tensor + return %0 : tensor + +// CHECK-LABEL: argmax64 +// CHECK: %0 = "tfl.arg_max"(%arg0, %arg1) : (tensor<3xi32>, tensor) -> tensor +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/tests/ops.mlir b/tensorflow/compiler/mlir/lite/tests/ops.mlir index aaa560c0fd6..ec31bf34b70 100644 --- a/tensorflow/compiler/mlir/lite/tests/ops.mlir +++ b/tensorflow/compiler/mlir/lite/tests/ops.mlir @@ -917,3 +917,19 @@ func @testOneHotWithInvalidOutputType(%arg0: tensor<3xi32>, %arg1: tensor, %0 = "tfl.one_hot"(%arg0, %arg1, %arg2, %arg3) {axis = -1 : i32} : (tensor<3xi32>, tensor, tensor, tensor) -> tensor<*xi8> return %0 : tensor<*xi8> } + +// ----- + +func @testArgMax(%arg0: tensor<3xi32>, %arg1: tensor) -> tensor { + // CHECK: "tfl.arg_max"(%arg0, %arg1) {output_type = 2 : i32} : (tensor<3xi32>, tensor) -> tensor + %0 = "tfl.arg_max"(%arg0, %arg1) {output_type = 2 : i32} : (tensor<3xi32>, tensor) -> tensor + return %0 : tensor +} + +// ----- + +func @testArgMin(%arg0: tensor<3xi32>, %arg1: tensor) -> tensor { + // CHECK: "tfl.arg_min"(%arg0, %arg1) {output_type = 2 : i32} : (tensor<3xi32>, tensor) -> tensor + %0 = "tfl.arg_min"(%arg0, %arg1) {output_type = 2 : i32} : (tensor<3xi32>, tensor) -> tensor + return %0 : tensor +} diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td index 90ff6713874..19ea5aa24fe 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td @@ -80,6 +80,7 @@ def : Pat<(TF_AvgPoolOp $value, /*stride_w=*/ExtractI32At<2>:$strides, /*fused_activation_function=*/TFL_AF_None)>; +def : Pat<(TF_ArgMaxOp $input, $dim), (TFL_ArgMaxOp $input, $dim)>; def : Pat<(TF_ArgMinOp $input, $dim), (TFL_ArgMinOp $input, $dim)>; def : Pat<(TF_CeilOp $arg), (TFL_CeilOp $arg)>; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 
9c256034c2b..a748e29ca26 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -123,6 +123,39 @@ def TF_AddV2Op : TF_Op<"AddV2", [Broadcastable, Commutative, NoSideEffect]>, let hasCanonicalizer = 1; } +def TF_ArgMaxOp : TF_Op<"ArgMax", [NoSideEffect]> { + let summary = [{ +Returns the index with the largest value across dimensions of a tensor. + }]; + + let description = [{ +Note that in case of ties the identity of the return value is not guaranteed. + +Usage: + ```python + import tensorflow as tf + a = [1, 10, 26.9, 2.8, 166.32, 62.3] + b = tf.math.argmax(input = a) + c = tf.keras.backend.eval(b) + # c = 4 + # here a[4] = 166.32 which is the largest element of a across axis 0 + ``` + }]; + + let arguments = (ins + TF_NumberTensor:$input, + TF_I32OrI64Tensor:$dimension + ); + + let results = (outs + TF_I32OrI64Tensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<1>; + TF_DerivedResultTypeAttr output_type = TF_DerivedResultTypeAttr<0>; +} + def TF_ArgMinOp : TF_Op<"ArgMin", [NoSideEffect]> { let summary = [{ Returns the index with the smallest value across dimensions of a tensor. @@ -1224,10 +1257,10 @@ for dtype in dtype_list: input_tensor, bitwise_ops.invert(input_tensor)), bitwise_ops.invert( tf.constant(0, dtype=dtype))] - + expected = tf.constant([0, 0, 0, 0], dtype=tf.float32) tf.assert_equal(tf.cast(not_a_and_a, tf.float32), expected) - + expected = tf.cast([not_0] * 4, tf.float32) tf.assert_equal(tf.cast(not_a_or_a, tf.float32), expected) From 61c3805ce3bc6f98148400d98f440f56fea18045 Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Wed, 24 Jul 2019 08:39:58 -0700 Subject: [PATCH 0459/3053] Add option to strip debug info on export. Start with not emitting internal buffer names on export if strip-debug-info is specified (default off). The name returned for internal buffer names are simply sequential digits as a start (this could be further reduced and further debug info also stripped). This is only the flatbuffer export side changes and the internal representation is not changed. PiperOrigin-RevId: 259747147 --- .../mlir/lite/flatbuffer_translate.cc | 38 ++++++++++++++----- .../compiler/mlir/lite/flatbuffer_translate.h | 2 + .../lite/tests/mlir2flatbuffer/simple.mlir | 11 ++++++ 3 files changed, 41 insertions(+), 10 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc b/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc index ab17d62fa53..5f460b45c16 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc @@ -105,9 +105,10 @@ using llvm::cl::opt; // These command line flags enable control of the translation implementation. 
bool emit_builtin_tflite_ops; -bool emit_select_tf_ops; bool emit_custom_ops; +bool emit_select_tf_ops; bool lower_tensor_list_ops; +bool strip_debug_info; // NOLINTNEXTLINE static opt emit_builtin_tflite_ops_flag( @@ -117,7 +118,7 @@ static opt emit_builtin_tflite_ops_flag( llvm::cl::location(emit_builtin_tflite_ops), llvm::cl::init(true)); // NOLINTNEXTLINE -static opt emit_select_tf_Ops_flag( +static opt emit_select_tf_ops_flag( "emit-select-tf-ops", llvm::cl::desc( "Emit Select TF operations (Flex ops) in the generated TFLite model"), @@ -135,6 +136,11 @@ static opt lower_tensor_list_ops_flag( llvm::cl::desc("Lower the TensorList ops within the TFLite dialect"), llvm::cl::location(lower_tensor_list_ops), llvm::cl::init(false)); +// NOLINTNEXTLINE +static opt strip_debug_info_flag( + "strip-debug-info", llvm::cl::desc("Strip debug info during export"), + llvm::cl::location(strip_debug_info), llvm::cl::init(false)); + ABSL_CONST_INIT const absl::string_view kFlexOpNamePrefix = "Flex"; // Use initial buffer size in flatbuffer builder to be same as the initial size @@ -328,13 +334,17 @@ class Translator { static Optional Translate(ModuleOp module, bool emit_builtin_tflite_ops, bool emit_select_tf_ops, - bool emit_custom_ops); + bool emit_custom_ops, + bool strip_debug_info); private: enum class OpType : char { kTfliteBuiltin, kSelectTf, kCustomOp }; explicit Translator(ModuleOp module, bool emit_builtin_tflite_ops, - bool emit_select_tf_ops, bool emit_custom_ops) - : module_(module), builder_(kInitialBufferSize) { + bool emit_select_tf_ops, bool emit_custom_ops, + bool strip_debug_info) + : module_(module), + builder_(kInitialBufferSize), + strip_debug_info_(strip_debug_info) { // The first buffer must be empty according to the schema definition. empty_buffer_ = tflite::CreateBuffer(builder_); buffers_.push_back(empty_buffer_); @@ -437,9 +447,15 @@ class Translator { // Suffix used to generate unique tensor names from operation names. int name_counter_ = 0; + + // Whether to strip or not emit debug info. + const bool strip_debug_info_; }; std::string Translator::GetName(Operation* inst) { + // If strip_debug_info_ is set, then simply return counter value. 
+ if (strip_debug_info_) return Twine(name_counter_++).str(); + if (auto name_loc = inst->getLoc().dyn_cast()) return name_loc.getName().str(); @@ -461,7 +477,7 @@ std::string Translator::UniqueName(llvm::StringRef prefix) { int64_t& prefix_count = name_to_count_[name]; int64_t val = prefix_count; while (val != 0) { - name = (prefix + llvm::Twine(prefix_count)).str(); + name = (prefix + Twine(prefix_count)).str(); ++prefix_count; val = name_to_count_[name]; } @@ -949,10 +965,11 @@ Optional> Translator::BuildSubGraph(FuncOp fn) { Optional Translator::Translate(ModuleOp module, bool emit_builtin_tflite_ops, bool emit_select_tf_ops, - bool emit_custom_ops) { + bool emit_custom_ops, + bool strip_debug_info) { if (!IsValidTFLiteMlirModule(module)) return llvm::None; Translator translator(module, emit_builtin_tflite_ops, emit_select_tf_ops, - emit_custom_ops); + emit_custom_ops, strip_debug_info); return translator.TranslateInternal(); } @@ -1014,8 +1031,9 @@ bool tflite::MlirToFlatBufferTranslateFunction( ModuleOp module, std::string* serialized_flatbuffer, bool emit_builtin_tflite_ops, bool emit_select_tf_ops, bool emit_custom_ops) { - auto maybe_translated = Translator::Translate( - module, emit_builtin_tflite_ops, emit_select_tf_ops, emit_custom_ops); + auto maybe_translated = + Translator::Translate(module, emit_builtin_tflite_ops, emit_select_tf_ops, + emit_custom_ops, strip_debug_info_flag); if (!maybe_translated) return true; *serialized_flatbuffer = std::move(*maybe_translated); return false; diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_translate.h b/tensorflow/compiler/mlir/lite/flatbuffer_translate.h index 820b2697e43..f8996d2c124 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_translate.h +++ b/tensorflow/compiler/mlir/lite/flatbuffer_translate.h @@ -27,6 +27,8 @@ extern bool emit_select_tf_ops; extern bool emit_custom_ops; // The flag to control whether to lower tensorlist ops into TF ops. extern bool lower_tensor_list_ops; +// The flag to control whether debug info gets stripped on export. 
+extern bool strip_debug_info; namespace tflite { diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/simple.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/simple.mlir index eb9119d1c46..43ee98934e0 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/simple.mlir +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/simple.mlir @@ -1,4 +1,5 @@ // RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - -strip-debug-info | flatbuffer_to_string - | FileCheck --dump-input-on-failure %s --check-prefix=STRIP func @main(tensor<3x2xi32>) -> tensor<3x2xi32> attributes {tf.entry_function = {inputs = "input", outputs = "SameNameAsOutput"}} { @@ -16,6 +17,8 @@ func @main(tensor<3x2xi32>) -> tensor<3x2xi32> // CHECK-NEXT: type: INT32, // CHECK-NEXT: buffer: 1, // CHECK-NEXT: name: "input", +// STRIP: buffer: 1, +// STRIP-NEXT: name: "input", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } @@ -24,6 +27,8 @@ func @main(tensor<3x2xi32>) -> tensor<3x2xi32> // CHECK-NEXT: type: INT32, // CHECK-NEXT: buffer: 2, // CHECK-NEXT: name: "Const", +// STRIP: buffer: 2, +// STRIP-NEXT: name: "0", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } @@ -32,6 +37,8 @@ func @main(tensor<3x2xi32>) -> tensor<3x2xi32> // CHECK-NEXT: type: INT32, // CHECK-NEXT: buffer: 3, // CHECK-NEXT: name: "sub", +// STRIP: buffer: 3, +// STRIP-NEXT: name: "1", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } @@ -40,6 +47,8 @@ func @main(tensor<3x2xi32>) -> tensor<3x2xi32> // CHECK-NEXT: type: INT32, // CHECK-NEXT: buffer: 4, // CHECK-NEXT: name: "SameNameAsOutput1", +// STRIP: buffer: 4, +// STRIP-NEXT: name: "2", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } @@ -48,6 +57,8 @@ func @main(tensor<3x2xi32>) -> tensor<3x2xi32> // CHECK-NEXT: type: INT32, // CHECK-NEXT: buffer: 5, // CHECK-NEXT: name: "SameNameAsOutput", +// STRIP: buffer: 5, +// STRIP-NEXT: name: "SameNameAsOutput", // CHECK-NEXT: quantization: { // CHECK-EMPTY: // CHECK-NEXT: } From 4d5a80be52a638bd4537e8f984a6d3df9936ceb8 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Wed, 24 Jul 2019 09:05:59 -0700 Subject: [PATCH 0460/3053] Simplify test for importing GraphDef with a custom operation This makes the test shorter and focused exactly on what it is supposed to test. 
PiperOrigin-RevId: 259751931 --- .../graph-custom-operation.pbtxt | 2168 +---------------- 1 file changed, 18 insertions(+), 2150 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt index 82146716fff..83c1d2dc15c 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt @@ -1,209 +1,8 @@ # RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s node { - name: "Placeholder" - op: "Placeholder" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "shape" - value { - shape { - unknown_rank: true - } - } - } -} -node { - name: "Placeholder_1" - op: "Placeholder" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "shape" - value { - shape { - unknown_rank: true - } - } - } -} -node { - name: "input0" - op: "TPUReplicatedInput" - input: "Placeholder" - attr { - key: "N" - value { - i: 1 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "input1" - op: "TPUReplicatedInput" - input: "Placeholder_1" - attr { - key: "N" - value { - i: 1 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "cluster/pivot" - op: "NoOp" -} -node { - name: "TPUReplicateMetadata" - op: "TPUReplicateMetadata" - input: "^cluster/pivot" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "computation_shape" - value { - list { - } - } - } - attr { - key: "device_assignment" - value { - list { - } - } - } - attr { - key: "host_compute_core" - value { - list { - } - } - } - attr { - key: "num_cores_per_replica" - value { - i: 1 - } - } - attr { - key: "num_replicas" - value { - i: 1 - } - } - attr { - key: "topology" - value { - s: "" - } - } - attr { - key: "use_tpu" - value { - b: true - } - } -} -node { - name: "replicated_input_0" - op: "Identity" - input: "input0" - input: "^TPUReplicateMetadata" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "replicated_input_1" - op: "Identity" - input: "input1" - input: "^TPUReplicateMetadata" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/maximum_iterations" + name: "Constant" op: "Const" - input: "^TPUReplicateMetadata" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 10 - } - } - } -} -node { - name: "while/iteration_counter" - op: "Const" - input: "^TPUReplicateMetadata" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } attr { key: "dtype" value { @@ -223,1968 +22,37 @@ node { } } node { - name: "while/Enter" - op: "Enter" - input: "while/iteration_counter" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "while/while_context" - } - } - attr { - key: "is_constant" - value { - b: false - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "while/Enter_1" - op: "Enter" - input: "replicated_input_0" - attr { - key: "T" - value { - type: DT_FLOAT - } 
- } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "while/while_context" - } - } - attr { - key: "is_constant" - value { - b: false - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "while/Enter_2" - op: "Enter" - input: "replicated_input_1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "while/while_context" - } - } - attr { - key: "is_constant" - value { - b: false - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "while/Merge" - op: "Merge" - input: "while/Enter" - input: "while/NextIteration" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Merge_1" - op: "Merge" - input: "while/Enter_1" - input: "while/NextIteration_1" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Merge_2" - op: "Merge" - input: "while/Enter_2" - input: "while/NextIteration_2" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Less/Enter" - op: "Enter" - input: "while/maximum_iterations" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "while/while_context" - } - } - attr { - key: "is_constant" - value { - b: true - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "while/Less" - op: "Less" - input: "while/Merge" - input: "while/Less/Enter" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/less_than_5_If8q4vKg9jA" - op: "less_than_5_If8q4vKg9jA" - input: "while/Merge_1" - input: "^while/Merge" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/LogicalAnd" - op: "LogicalAnd" - input: "while/Less" - input: "while/less_than_5_If8q4vKg9jA" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/LoopCond" - op: "LoopCond" - input: "while/LogicalAnd" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Switch" - op: "Switch" - input: "while/Merge" - input: "while/LoopCond" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_class" - value { - list { - s: "loc:@while/Merge" - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Switch_1" - op: "Switch" - input: "while/Merge_1" - input: "while/LoopCond" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@while/Merge_1" - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Switch_2" - op: "Switch" - input: "while/Merge_2" - input: "while/LoopCond" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@while/Merge_2" - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - 
name: "while/Identity" - op: "Identity" - input: "while/Switch:1" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Identity_1" - op: "Identity" - input: "while/Switch_1:1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Identity_2" - op: "Identity" - input: "while/Switch_2:1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/add/y" - op: "Const" - input: "^while/Identity" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "while/add" - op: "Add" - input: "while/Identity" - input: "while/add/y" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/add_1/y" - op: "Const" - input: "^while/Identity" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - } - float_val: 1 - } - } - } -} -node { - name: "while/add_1" - op: "Add" - input: "while/Identity_1" - input: "while/add_1/y" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/mul_2_Da30D05wlPU" - op: "mul_2_Da30D05wlPU" - input: "while/Identity_1" - input: "while/Identity_2" - input: "^while/Identity" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/NextIteration" - op: "NextIteration" - input: "while/add" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/NextIteration_1" - op: "NextIteration" - input: "while/add_1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/NextIteration_2" - op: "NextIteration" - input: "while/mul_2_Da30D05wlPU" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Exit" - op: "Exit" - input: "while/Switch" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Exit_1" - op: "Exit" - input: "while/Switch_1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "while/Exit_2" - op: "Exit" - input: "while/Switch_2" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/Shape" - op: "Shape" - input: "while/Exit_2" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "out_type" - value { - type: DT_INT32 - } - } -} -node { - name: "gradients/grad_ys_0" - op: "Const" - input: "^TPUReplicateMetadata" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - 
attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - } - float_val: 1 - } - } - } -} -node { - name: "gradients/Fill" - op: "Fill" - input: "gradients/Shape" - input: "gradients/grad_ys_0" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "index_type" - value { - type: DT_INT32 - } - } -} -node { - name: "gradients/f_count" - op: "Const" - input: "^TPUReplicateMetadata" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 0 - } - } - } -} -node { - name: "gradients/f_count_1" - op: "Enter" - input: "gradients/f_count" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "while/while_context" - } - } - attr { - key: "is_constant" - value { - b: false - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "gradients/Merge" - op: "Merge" - input: "gradients/f_count_1" - input: "gradients/NextIteration" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/Switch" - op: "Switch" - input: "gradients/Merge" - input: "while/LoopCond" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/Add/y" - op: "Const" - input: "^while/Identity" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "gradients/Add" - op: "Add" - input: "gradients/Switch:1" - input: "gradients/Add/y" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/f_count_2" - op: "Exit" - input: "gradients/Switch" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/b_count" - op: "Const" - input: "^TPUReplicateMetadata" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "gradients/b_count_1" - op: "Enter" - input: "gradients/f_count_2" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "gradients/while/while_context" - } - } - attr { - key: "is_constant" - value { - b: false - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "gradients/Merge_1" - op: "Merge" - input: "gradients/b_count_1" - input: "gradients/NextIteration_1" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/GreaterEqual/Enter" - op: "Enter" - input: "gradients/b_count" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - 
key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "gradients/while/while_context" - } - } - attr { - key: "is_constant" - value { - b: true - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "gradients/GreaterEqual" - op: "GreaterEqual" - input: "gradients/Merge_1" - input: "gradients/GreaterEqual/Enter" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/b_count_2" - op: "LoopCond" - input: "gradients/GreaterEqual" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/Switch_1" - op: "Switch" - input: "gradients/Merge_1" - input: "gradients/b_count_2" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/Sub" - op: "Sub" - input: "gradients/Switch_1:1" - input: "gradients/GreaterEqual/Enter" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/b_count_3" - op: "Exit" - input: "gradients/Switch_1" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/zeros_like" - op: "ZerosLike" - input: "while/Exit_1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/while/Exit_2_grad/b_exit" - op: "Enter" - input: "gradients/Fill" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "gradients/while/while_context" - } - } - attr { - key: "is_constant" - value { - b: false - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "gradients/while/Exit_1_grad/b_exit" - op: "Enter" - input: "gradients/zeros_like" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "gradients/while/while_context" - } - } - attr { - key: "is_constant" - value { - b: false - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "gradients/while/Switch_2_grad/b_switch" - op: "Merge" - input: "gradients/while/Exit_2_grad/b_exit" - input: "gradients/while/Switch_2_grad_1/NextIteration" - attr { - key: "N" - value { - i: 2 - } - } - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/while/Merge_2_grad/Switch" - op: "Switch" - input: "gradients/while/Switch_2_grad/b_switch" - input: "gradients/b_count_2" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_class" - value { - list { - s: "loc:@gradients/while/Switch_2_grad/b_switch" - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/while/Enter_2_grad/Exit" - op: "Exit" - input: "gradients/while/Merge_2_grad/Switch" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Const" - op: "Const" - input: "^cluster/pivot" - attr { - key: "_class" - value { - list { - s: 
"loc:@while/Identity_1" - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/mul" - op: "Mul" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Const" - input: "while/maximum_iterations" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_class" - value { - list { - s: "loc:@while/Identity_1" - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/f_acc" - op: "StackV2" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/mul" - attr { - key: "_class" - value { - list { - s: "loc:@while/Identity_1" - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "elem_type" - value { - type: DT_FLOAT - } - } - attr { - key: "stack_name" - value { - s: "" - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Enter" - op: "Enter" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/f_acc" - attr { - key: "T" - value { - type: DT_RESOURCE - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "while/while_context" - } - } - attr { - key: "is_constant" - value { - b: true - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPushV2" - op: "StackPushV2" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Enter" - input: "while/Identity_1" - input: "^gradients/Add" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "swap_memory" - value { - b: false - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2/Enter" - op: "Enter" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/f_acc" - attr { - key: "T" - value { - type: DT_RESOURCE - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "gradients/while/while_context" - } - } - attr { - key: "is_constant" - value { - b: true - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2" - op: "StackPopV2" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2/Enter" - input: "^gradients/Sub" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "elem_type" - value { - type: DT_FLOAT - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Const_1" - op: "Const" - input: "^cluster/pivot" - attr { - key: "_class" - value { - list { - s: "loc:@while/Identity_2" - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "dtype" - value { - type: DT_INT32 - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_INT32 - tensor_shape { - } - int_val: 1 - } - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/mul_1" - op: "Mul" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Const_1" - input: "while/maximum_iterations" - attr { - key: "T" - 
value { - type: DT_INT32 - } - } - attr { - key: "_class" - value { - list { - s: "loc:@while/Identity_2" - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/f_acc_1" - op: "StackV2" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/mul_1" - attr { - key: "_class" - value { - list { - s: "loc:@while/Identity_2" - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "elem_type" - value { - type: DT_FLOAT - } - } - attr { - key: "stack_name" - value { - s: "" - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Enter_1" - op: "Enter" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/f_acc_1" - attr { - key: "T" - value { - type: DT_RESOURCE - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "while/while_context" - } - } - attr { - key: "is_constant" - value { - b: true - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPushV2_1" - op: "StackPushV2" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/Enter_1" - input: "while/Identity_2" - input: "^gradients/Add" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "swap_memory" - value { - b: false - } - } -} -node { - name: "gradients/NextIteration" - op: "NextIteration" - input: "gradients/Add" - input: "^gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPushV2" - input: "^gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPushV2_1" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2_1/Enter" - op: "Enter" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/f_acc_1" - attr { - key: "T" - value { - type: DT_RESOURCE - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "frame_name" - value { - s: "gradients/while/while_context" - } - } - attr { - key: "is_constant" - value { - b: true - } - } - attr { - key: "parallel_iterations" - value { - i: 10 - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2_1" - op: "StackPopV2" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2_1/Enter" - input: "^gradients/Sub" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "elem_type" - value { - type: DT_FLOAT - } - } -} -node { - name: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient" - op: "SymbolicGradient" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2_1" - input: "gradients/while/Merge_2_grad/Switch:1" - input: "^gradients/Sub" - attr { - key: "Tin" - value { - list { - type: DT_FLOAT - type: DT_FLOAT - type: DT_FLOAT - } - } - } - attr { - key: "Tout" - value { - list { - type: DT_FLOAT - type: DT_FLOAT - } - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - attr { - key: "f" - value { - func { - name: "mul_2_Da30D05wlPU" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } - } - } - } -} -node { - name: 
"gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/b_sync" - op: "ControlTrigger" - input: "^cluster/pivot" - input: "^gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2" - input: "^gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/StackPopV2_1" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/NextIteration_1" - op: "NextIteration" - input: "gradients/Sub" - input: "^gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/b_sync" - attr { - key: "T" - value { - type: DT_INT32 - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "gradients/while/Switch_2_grad_1/NextIteration" - op: "NextIteration" - input: "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient:1" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "NoOp" - op: "NoOp" - input: "^cluster/pivot" - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "Identity" - op: "Identity" - input: "gradients/while/Enter_2_grad/Exit" - device: "/device:TPU_REPLICATED_CORE:0" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "_tpu_replicate" - value { - s: "cluster" - } - } -} -node { - name: "output0" - op: "TPUReplicatedOutput" - input: "Identity" - attr { - key: "T" - value { - type: DT_FLOAT - } - } - attr { - key: "num_replicas" - value { - i: 1 - } - } -} -node { - name: "TPUCompilationResult" - op: "TPUCompilationResult" - input: "^TPUReplicateMetadata" - attr { - key: "_tpu_compilation_status" - value { - s: "cluster" - } - } -} -node { - name: "output_0_shard_0" - op: "Identity" - input: "output0" - input: "^NoOp" - attr { - key: "T" - value { - type: DT_FLOAT - } - } -} -node { - name: "ConfigureDistributedTPU" - op: "ConfigureDistributedTPU" - device: "/device:TPU_SYSTEM:0" - attr { - key: "embedding_config" - value { - s: "" - } - } - attr { - key: "is_global_init" - value { - b: false - } - } - attr { - key: "tpu_embedding_config" - value { - s: "" - } - } + name: "_tf.foo" + op: "foo" + input: "Constant" } library { function { signature { - name: "mul_2_Da30D05wlPU" + name: "foo" input_arg { - name: "mul_2_da30d05wlpu" - type: DT_FLOAT - } - input_arg { - name: "mul_2_da30d05wlpu1" - type: DT_FLOAT + name: "arg" + type: DT_INT32 } output_arg { - name: "mul_2_da30d05wlpu2" - type: DT_FLOAT - } - } - node_def { - name: "mul/y" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - dim { - size: 1 - } - dim { - size: 1 - } - } - float_val: 2 - } - } - } - } - node_def { - name: "mul_0" - op: "Mul" - input: "mul_2_da30d05wlpu1" - input: "mul/y:output:0" - attr { - key: "T" - value { - type: DT_FLOAT - } + name: "return_value" + type: DT_INT32 } } ret { - key: "mul_2_da30d05wlpu2" - value: "mul_0:z:0" - } - attr { - key: "_noinline" - value { - b: true - } - } - } - function { - signature { - name: "less_than_5_If8q4vKg9jA" - input_arg { - name: "less_than_5_if8q4vkg9ja" - type: DT_FLOAT - } - output_arg { - name: "less_than_5_if8q4vkg9ja1" - type: DT_BOOL - } - } - node_def { - name: "Less/y" - op: "Const" - attr { - key: "dtype" - value { - type: DT_FLOAT - } - } - attr { - key: "value" - value { - tensor { - dtype: DT_FLOAT - tensor_shape { - } - float_val: 5 - } - } - } - } - node_def { - name: "Less" - op: "Less" - input: "less_than_5_if8q4vkg9ja" - input: "Less/y:output:0" - 
attr { - key: "T" - value { - type: DT_FLOAT - } - } - } - ret { - key: "less_than_5_if8q4vkg9ja1" - value: "Less:z:0" - } - attr { - key: "_noinline" - value { - b: true - } + key: "return_value" + value: "arg" } } } versions { - producer: 27 + producer: 62 min_consumer: 12 } -# CHECK: func @main() { -# CHECK: %30:2 = "_tf.less_than_5_If8q4vKg9jA0"(%23#0, %29#2) {_tpu_replicate = "cluster", device = "", name = "while/less_than_5_If8q4vKg9jA"} : (tensor<*xf32>, !_tf.control) -> (tensor<*xi1>, !_tf.control) -# CHECK: %73:2 = "_tf.mul_2_Da30D05wlPU0"(%58#0, %72#0, %47#1) {_tpu_replicate = "cluster", device = "", name = "while/mul_2_Da30D05wlPU"} : (tensor<*xf32>, tensor<*xf32>, !_tf.control) -> (tensor<*xf32>, !_tf.control) -# CHECK: return -# CHECK-NEXT: } -# CHECK: func @less_than_5_If8q4vKg9jA0(%arg0: tensor<*xf32>) -> tensor<*xi1> -# CHECK-NEXT: attributes {tf._noinline = true} { -# CHECK-NEXT: %0:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_FLOAT", name = "Less/y", value = dense<5.000000e+00> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Less"(%arg0, %0#0) {T = "tfdtype$DT_FLOAT", device = "", name = "Less"} : (tensor<*xf32>, tensor) -> (tensor<*xi1>, !_tf.control) -# CHECK-NEXT: return %1#0 : tensor<*xi1> -# CHECK-NEXT: } -# CHECK: func @mul_2_Da30D05wlPU0(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> -# CHECK-NEXT: attributes {tf._noinline = true} { -# CHECK-NEXT: %0:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_FLOAT", name = "mul/y", value = dense<2.000000e+00> : tensor<1x1xf32>} : () -> (tensor<1x1xf32>, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Mul"(%arg1, %0#0) {T = "tfdtype$DT_FLOAT", device = "", name = "mul_0"} : (tensor<*xf32>, tensor<1x1xf32>) -> (tensor<*xf32>, !_tf.control) -# CHECK-NEXT: return %1#0 : tensor<*xf32> -# CHECK-NEXT: } +# Verify that we can import a custom operation that maps to a function and that +# the names are matching between the function definition and the uses / call +# site (a numerical suffix may be appended). + +# CHECK: "_tf.foo0"( +# CHECK: func @foo0 From 95995634b546286a4b393c43ca8a848e461964b3 Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Wed, 24 Jul 2019 09:21:57 -0700 Subject: [PATCH 0461/3053] Fix missing namespace: int64 is in tf namespace. Alternative is to include tensorflow/compiler/xla/types.h but I preferred this as its more targeted in the header file. PiperOrigin-RevId: 259754662 --- tensorflow/compiler/mlir/xla/hlo_function_importer.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/mlir/xla/hlo_function_importer.h b/tensorflow/compiler/mlir/xla/hlo_function_importer.h index c1f091a08cd..13671dd0310 100644 --- a/tensorflow/compiler/mlir/xla/hlo_function_importer.h +++ b/tensorflow/compiler/mlir/xla/hlo_function_importer.h @@ -89,7 +89,8 @@ class HloFunctionImporter { xla::HloInstruction* instruction); // Converts the dimensions of an HLO instruction into an MLIR attribute. - mlir::ElementsAttr ConvertDimensions(llvm::ArrayRef op_dimensions); + mlir::ElementsAttr ConvertDimensions( + llvm::ArrayRef op_dimensions); // Converts Array ref to an ElementsAttr. mlir::ElementsAttr Convert(llvm::ArrayRef op_dimensions); From af0d72f709c142194d29753e5dbeea3cb6cd9ea9 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Wed, 24 Jul 2019 09:26:46 -0700 Subject: [PATCH 0462/3053] Simplify the graphdef2mlir/graph-func-attr.pbtxt test to be more targeted This test is intended to check that NameAttrList are properly imported. 
The CHECK lines are updated to assert this and only this. PiperOrigin-RevId: 259755466 --- .../tests/graphdef2mlir/graph-func-attr.pbtxt | 21 +++++++------------ 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-func-attr.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-func-attr.pbtxt index e8b9ce86ddb..0176edb4b21 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-func-attr.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-func-attr.pbtxt @@ -1,5 +1,13 @@ # RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s +# CHECK-LABEL: func @main() { + +# Verify that the NameAttrList is properly turned into reference to functions on import +# CHECK: tf.Case +# CHECK-SAME: branches = [@[[FOO:[a-z0-9]+]], @[[BAR:[a-z0-9]+]]] +# CHECK-DAG: func @[[FOO]]() +# CHECK-DAG: func @[[BAR]]() + node { name: "predicate" op: "Const" @@ -152,16 +160,3 @@ versions { min_consumer: 12 } -# CHECK: func @main() { -# CHECK-NEXT: %0:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "predicate", value = dense<0> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Case"(%0#0) {Tin = [], Tout = ["tfdtype$DT_FLOAT"], branches = [@foo0, @bar0], device = "", name = "Case", output_shapes = []} : (tensor) -> (tensor<*xf32>, !_tf.control) -# CHECK-NEXT: return -# CHECK-NEXT: } -# CHECK: func @foo0() -> tensor<10xf32> { -# CHECK-NEXT: %0:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_FLOAT", name = "const_1", value = dense<1.000000e+00> : tensor<10xf32>} : () -> (tensor<10xf32>, !_tf.control) -# CHECK-NEXT: return %0#0 : tensor<10xf32> -# CHECK-NEXT: } -# CHECK: func @bar0() -> tensor<10xf32> { -# CHECK-NEXT: %0:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_FLOAT", name = "const_2", value = dense<2.000000e+00> : tensor<10xf32>} : () -> (tensor<10xf32>, !_tf.control) -# CHECK-NEXT: return %0#0 : tensor<10xf32> -# CHECK-NEXT: } From f286d1697069ba48404044574344b51a60792098 Mon Sep 17 00:00:00 2001 From: "Patrick J. LoPresti" Date: Wed, 24 Jul 2019 09:39:50 -0700 Subject: [PATCH 0463/3053] Propagate intra_op_parallelism_threads from SessionOptions to xla::LocalClientOptions. With CPU Tensorflow, when we set intra_op_parallelism_threads to 1 in the SessionOptions config, the XLA CPU backend still spawns a huge number of threads because XlaDevice does not propagate this option when it creates an xla::LocalClient. Fix is a fairly simple. --- tensorflow/compiler/jit/xla_device.cc | 7 ++++++- tensorflow/compiler/jit/xla_device.h | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc index 1d8b4beb8bd..16c1e16330c 100644 --- a/tensorflow/compiler/jit/xla_device.cc +++ b/tensorflow/compiler/jit/xla_device.cc @@ -203,6 +203,7 @@ XlaDevice::XlaDevice(const SessionOptions& session_options, device_ordinal_(options.device_ordinal), jit_device_name_(options.compilation_device_name), platform_(options.platform), + intra_op_parallelism_threads_(session_options.config.intra_op_parallelism_threads()), use_multiple_streams_(options.use_multiple_streams), shape_representation_fn_(options.shape_representation_fn), allowed_devices_(options.allowed_devices) { @@ -233,9 +234,13 @@ xla::LocalClient* XlaDevice::client() const { // don't want to do it until we get a chance to hook the platform up // to a simulator. 
+ xla::LocalClientOptions options; + options.set_platform(platform_) + .set_allowed_devices(allowed_devices_) + .set_intra_op_parallelism_threads(intra_op_parallelism_threads_); // TODO(b/78468222): This can fail, at least when the backend is GPU and // there is no GPU on the host. - return xla::ClientLibrary::GetOrCreateLocalClient(platform_, allowed_devices_) + return xla::ClientLibrary::GetOrCreateLocalClient(options) .ValueOrDie(); } diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h index 51910c6fabc..877580e73f9 100644 --- a/tensorflow/compiler/jit/xla_device.h +++ b/tensorflow/compiler/jit/xla_device.h @@ -202,6 +202,8 @@ class XlaDevice : public LocalDevice { const DeviceType jit_device_name_; // The platform for this device. se::Platform* const platform_; // Not owned. + // Intra-op threads to spawn (from SessionOptions). + const int intra_op_parallelism_threads_; // Memory allocator associated with this device. Allocator* xla_allocator_ GUARDED_BY(mu_) = nullptr; // Not owned. From 71b3131a5427ff9679593c580e4d1d9319976ea1 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 24 Jul 2019 16:42:29 +0000 Subject: [PATCH 0464/3053] Use lambda to switch between file_io.FileIO and gzip.open bazed on review feedback Signed-off-by: Yong Tang --- .../python/data/experimental/ops/readers.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/tensorflow/python/data/experimental/ops/readers.py b/tensorflow/python/data/experimental/ops/readers.py index ae20b5e1cd7..191226fd2ee 100644 --- a/tensorflow/python/data/experimental/ops/readers.py +++ b/tensorflow/python/data/experimental/ops/readers.py @@ -114,7 +114,7 @@ def _next_csv_row( filenames, num_cols, field_delim, use_quote_delim, header, file_io_fn): """Generator that yields rows of CSV file(s) in order.""" for fn in filenames: - with file_io_fn(fn, "r") as f: + with file_io_fn(fn) as f: rdr = csv.reader( f, delimiter=field_delim, @@ -164,7 +164,7 @@ def _infer_column_names(filenames, field_delim, use_quote_delim, file_io_fn): "delimiter": field_delim, "quoting": csv.QUOTE_MINIMAL if use_quote_delim else csv.QUOTE_NONE } - with file_io_fn(filenames[0], "r") as f: + with file_io_fn(filenames[0]) as f: try: column_names = next(csv.reader(f, **csv_kwargs)) except StopIteration: @@ -172,7 +172,7 @@ def _infer_column_names(filenames, field_delim, use_quote_delim, file_io_fn): "of %s. Empty file?") % filenames[0]) for name in filenames[1:]: - with file_io_fn(name, "r") as f: + with file_io_fn(name) as f: try: if next(csv.reader(f, **csv_kwargs)) != column_names: raise ValueError( @@ -431,21 +431,15 @@ def make_csv_dataset_v2( dataset = dataset.shuffle(len(filenames), shuffle_seed) # Clean arguments; figure out column names and defaults - def gzip_file_io_open(filename, mode): - # By default, gzip will open in byte mode which will - # not work with csv.reader so we create a wrapper to - # append `t`. 
- mode = mode + "t" if "t" not in mode else mode - return gzip.open(filename, mode) if column_names is None or column_defaults is None: # Find out which io function to open the file - file_io_fn = file_io.FileIO + file_io_fn = lambda filename: file_io.FileIO(filename, 'r') if compression_type is not None: compression_type_value = tensor_util.constant_value(compression_type) if compression_type_value is None: raise ValueError("Received unkown compression_type") if compression_type_value == "GZIP": - file_io_fn = gzip_file_io_open + file_io_fn = lambda filename: gzip.open(filename, 'rt') elif compression_type_value == "ZLIB": raise ValueError( "compression_type (%s) is not supported for probing columns" % From 2d10cc585fbc151c3dd67579d6a5e2842803ebc3 Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Wed, 24 Jul 2019 09:41:55 -0700 Subject: [PATCH 0465/3053] Add missing header for cord. PiperOrigin-RevId: 259758313 --- tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc index e872ab3f1fb..380d1253370 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc @@ -35,6 +35,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor_shape.pb.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/cord.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/stream_executor/lib/statusor.h" From 73243a836c7f850225ee53eeac69962e78b157ab Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Wed, 24 Jul 2019 09:44:24 -0700 Subject: [PATCH 0466/3053] Simplify graphdef2mlir/graph-function-defs.pbtxt test to be more targeted This test intends to check that we properly import call site function attributes. The CHECK lines are updated to reflect this. PiperOrigin-RevId: 259758782 --- .../graphdef2mlir/graph-function-defs.pbtxt | 39 +++---------------- 1 file changed, 6 insertions(+), 33 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-defs.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-defs.pbtxt index 249a1efa952..6a2a411d115 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-defs.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-defs.pbtxt @@ -1,5 +1,11 @@ # RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s +# Verify that we properly import call site function attributes. 
+# CHECK: tf.If +# CHECK-SAME: then_branch = @ +# CHECK-SAME: then_branch.how_many = 32 +# CHECK-SAME: then_branch.ping = "ack" + node { name: "Placeholder" op: "Placeholder" @@ -503,36 +509,3 @@ versions { producer: 27 min_consumer: 12 } - -# CHECK: func @main() { -# CHECK-NEXT: %0:2 = "_tf.ConfigureDistributedTPU"() {device = "/device:TPU_SYSTEM:0", embedding_config = "", is_global_init = false, name = "ConfigureDistributedTPU", tpu_embedding_config = ""} : () -> (tensor<*x!tf.string>, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Placeholder"() {device = "", dtype = "tfdtype$DT_INT32", name = "Placeholder", shape = "tfshape$unknown_rank: true\0A"} : () -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %2:2 = "_tf.TPUReplicatedInput"(%1#0) {N = 1 : i64, T = "tfdtype$DT_INT32", device = "", name = "input0"} : (tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %3:2 = "_tf.Placeholder"() {device = "", dtype = "tfdtype$DT_INT32", name = "Placeholder_1", shape = "tfshape$unknown_rank: true\0A"} : () -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %4:2 = "_tf.TPUReplicatedInput"(%3#0) {N = 1 : i64, T = "tfdtype$DT_INT32", device = "", name = "input1"} : (tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %5 = "_tf.NoOp"() {device = "", name = "cluster/pivot"} : () -> !_tf.control -# CHECK-NEXT: %6 = "_tf.NoOp"(%5) {_tpu_replicate = "cluster", device = "", name = "NoOp"} : (!_tf.control) -> !_tf.control -# CHECK-NEXT: %7 = "_tf.TPUReplicateMetadata"(%5) {_tpu_replicate = "cluster", allow_soft_placement = false, computation_shape = [], device = "", device_assignment = [], host_compute_core = [], name = "TPUReplicateMetadata", num_cores_per_replica = 1 : i64, num_replicas = 1 : i64, padding_map = [], step_marker_location = "STEP_MARK_AT_ENTRY", topology = "", use_tpu = true} : (!_tf.control) -> !_tf.control -# CHECK-NEXT: %8:2 = "_tf.TPUCompilationResult"(%7) {_tpu_compilation_status = "cluster", device = "", name = "TPUCompilationResult"} : (!_tf.control) -> (tensor, !_tf.control) -# CHECK-NEXT: %9:2 = "_tf.Identity"(%2#0, %7) {T = "tfdtype$DT_INT32", _tpu_replicate = "cluster", device = "", name = "replicated_input_0"} : (tensor<*xi32>, !_tf.control) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %10:2 = "_tf.Identity"(%4#0, %7) {T = "tfdtype$DT_INT32", _tpu_replicate = "cluster", device = "", name = "replicated_input_1"} : (tensor<*xi32>, !_tf.control) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %11:2 = "_tf.Less"(%9#0, %10#0) {T = "tfdtype$DT_INT32", _tpu_replicate = "cluster", device = "", name = "Less"} : (tensor<*xi32>, tensor<*xi32>) -> (tensor<*xi1>, !_tf.control) -# CHECK-NEXT: %12:3 = "_tf.If"(%11#0, %10#0, %9#0) {Tcond = "tfdtype$DT_BOOL", Tin = ["tfdtype$DT_INT32", "tfdtype$DT_INT32"], Tout = ["tfdtype$DT_INT32", "tfdtype$DT_INT32"], _tpu_replicate = "cluster", device = "", else_branch = @cond_false0, is_stateless = false, name = "cond", output_shapes = ["tfshape$unknown_rank: true\0A", "tfshape$unknown_rank: true\0A"], then_branch = @cond_true0, then_branch.how_many = 32 : i64, then_branch.ping = "ack"} : (tensor<*xi1>, tensor<*xi32>, tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %13:2 = "_tf.Identity"(%12#0) {T = "tfdtype$DT_INT32", _tpu_replicate = "cluster", device = "/device:TPU_REPLICATED_CORE:0", name = "Identity"} : (tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %14:2 = "_tf.TPUReplicatedOutput"(%13#0) {T = "tfdtype$DT_INT32", device = "", name = "output0", num_replicas = 1 : i64} : (tensor<*xi32>) -> 
(tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %15:2 = "_tf.Identity"(%14#0, %6) {T = "tfdtype$DT_INT32", device = "", name = "output_0_shard_0"} : (tensor<*xi32>, !_tf.control) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %16:2 = "_tf.Identity"(%12#1) {T = "tfdtype$DT_INT32", _tpu_replicate = "cluster", device = "/device:TPU_REPLICATED_CORE:0", name = "Identity_1"} : (tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %17:2 = "_tf.TPUReplicatedOutput"(%16#0) {T = "tfdtype$DT_INT32", device = "", name = "output1", num_replicas = 1 : i64} : (tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %18:2 = "_tf.Identity"(%17#0, %6) {T = "tfdtype$DT_INT32", device = "", name = "output_1_shard_0"} : (tensor<*xi32>, !_tf.control) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: return -# CHECK-NEXT: } -# CHECK: func @cond_false0(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi32>) { -# CHECK-NEXT: %0:2 = "_tf.Identity"(%arg0) {T = "tfdtype$DT_INT32", device = "", name = "Identity"} : (tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Identity"(%arg1) {T = "tfdtype$DT_INT32", device = "", name = "Identity_1"} : (tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: return %1#0, %0#0 : tensor<*xi32>, tensor<*xi32> -# CHECK-NEXT: } -# CHECK: func @cond_true0(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi32>) { -# CHECK-NEXT: %0:2 = "_tf.Identity"(%arg0) {T = "tfdtype$DT_INT32", device = "", name = "Identity"} : (tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Identity"(%arg1) {T = "tfdtype$DT_INT32", device = "", name = "Identity_1"} : (tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: return %0#0, %1#0 : tensor<*xi32>, tensor<*xi32> -# CHECK-NEXT: } From d694c96866c98b238397b75a3c4c7dcf48549e4e Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Wed, 24 Jul 2019 10:01:44 -0700 Subject: [PATCH 0467/3053] Simplify graphdef2mlir/graph-default-attr.pbtxt to be more targeted This test is intended to check that default attributes are added when missing from the GraphDef, the CHECK lines are updated to reflect this. PiperOrigin-RevId: 259762073 --- .../tests/graphdef2mlir/graph-default-attr.pbtxt | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-default-attr.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-default-attr.pbtxt index 46682ab866e..b26d7e7f2ba 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-default-attr.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-default-attr.pbtxt @@ -1,7 +1,15 @@ # RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s -# CHECK: %3:2 = "_tf.Conv2D"(%2#0, %1#0) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], name = "MobilenetV1/MobilenetV1/Conv2d_0/Conv2D", padding = "SAME", strides = [1, 2, 2, 1], use_cudnn_on_gpu = true} -# CHECK-NEXT: %4:2 = "_tf.MaxPool"(%3#0) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", device = "", ksize = [1, 2, 2, 1], name = "MaxPool", padding = "SAME", strides = [1, 2, 2, 1]} +# Verify that the data_format attributes is pulled from the default value in the +# registry when not present in the GraphDef +# CHECK: tf.Conv2D +# CHECK-SAME: data_format = "NHWC" + +# Verify that we can also pull some attributes that are needed to be able to +# create a Graph in memory, like `T`. 
+# CHECK: tf.MaxPool +# CHECK-SAME: T = "tfdtype$DT_FLOAT" + node { name: "input" op: "Placeholder" From ff64d8791b2981e3fe9f4b7702a3ec2cbb01870d Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Wed, 24 Jul 2019 10:15:01 -0700 Subject: [PATCH 0468/3053] Simplify graphdef2mlir/graph-empty-tensor-content.pbtxt test This test is intended to verify the tensor_content field on import to MLIR, the CHECK lines are updated to reflect this. PiperOrigin-RevId: 259765098 --- .../tests/graphdef2mlir/graph-empty-tensor-content.pbtxt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-empty-tensor-content.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-empty-tensor-content.pbtxt index 441eca84e7e..de56712ca13 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-empty-tensor-content.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-empty-tensor-content.pbtxt @@ -1,6 +1,9 @@ # RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s -# CHECK: %0:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_FLOAT", name = "Const", value = opaque<"tf", "0x746674656E736F722464747970653A2044545F464C4F41540A74656E736F725F7368617065207B0A202064696D207B0A2020202073697A653A20310A20207D0A7D0A"> : tensor<1xf32>} : () -> (tensor<1xf32>, !_tf.control) +# This test is intended to verify the tensor_content field on import of an empty +# tensor. +# CHECK: tf.Const +# CHECK-SAME: value = opaque<"tf", "0x746674656E736F722464747970653A2044545F464C4F41540A74656E736F725F7368617065207B0A202064696D207B0A2020202073697A653A20310A20207D0A7D0A"> node { name: "Const" From f22a98fcf8855cb252658437f248874bd7602082 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Wed, 24 Jul 2019 10:30:50 -0700 Subject: [PATCH 0469/3053] Simplify graphdef2mlir/graph-device-retval.pbtxt test to be more targeted This tests intend to verify that kDeviceRetOp (triggered by tf.experimental_ints_on_device) is properly handled on import and matched to a return operation. This updates the CHECK lines to reflect this. 
PiperOrigin-RevId: 259768410 --- .../tests/graphdef2mlir/graph-device-retval.pbtxt | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-device-retval.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-device-retval.pbtxt index fcd0e62ab63..157db7d5331 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-device-retval.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-device-retval.pbtxt @@ -74,6 +74,9 @@ library { } # The attribute "experimental_ints_on_device" and the return type INT32 # ensure that kDeviceRetOp is used instead of kRetOp + # CHECK-LABEL: func @foo + # CHECK: tf.experimental_ints_on_device = true + # CHECK: return %{{.*}} tensor attr { key: "experimental_ints_on_device" value { @@ -87,13 +90,3 @@ versions { min_consumer: 12 } -# CHECK: func @main() { -# CHECK-NEXT: %0:2 = "_tf.PartitionedCall"() {Tin = [], Tout = ["tfdtype$DT_INT32"], config = "", config_proto = "", device = "", executor_type = "", f = @foo0, name = "PartitionedCall"} : () -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: return -# CHECK-NEXT: } -# CHECK: func @foo0() -> tensor -# CHECK-NEXT: attributes {tf.experimental_ints_on_device = true} { -# CHECK-NEXT: %0:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "Const", value = dense<5> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Identity"(%0#0) {T = "tfdtype$DT_INT32", device = "", name = "Identity"} : (tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: return %1#0 : tensor -# CHECK-NEXT: } From 9d5e7bbd3189c09a9d6c09bc91516eb95e12ba1a Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Wed, 24 Jul 2019 10:44:10 -0700 Subject: [PATCH 0470/3053] TFTRT: Changed segment to graphdef conversion to create arg/ret ops instead of placeholder/identity. --- .../tf2tensorrt/convert/convert_graph.cc | 2 ++ .../tf2tensorrt/convert/convert_nodes.cc | 20 ++++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index a6ebebe5a60..71e754af38f 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -541,6 +541,7 @@ Status ModifyGraphForFunctionDef(Graph* graph, const GraphDef& segment, // graph is the input graph to be optimized by TRT. GraphConstructorOptions gcopts; TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(gcopts, segment, segment_graph)); + /* std::map io_nodes; int num_inputs = 0; for (auto n : segment_graph->op_nodes()) { @@ -615,6 +616,7 @@ Status ModifyGraphForFunctionDef(Graph* graph, const GraphDef& segment, } segment_graph->RemoveNode(node); } + */ return Status::OK(); } diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index 8419c13a37b..399b69a3dd7 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -5111,6 +5111,8 @@ Status ConvertSegmentToGraphDef( std::vector* connections, GraphDef* segment_def, string* scope_name) { std::set marker_nodes; + int arg_num = 0; + int ret_num = 0; // Update connection shapes/data types and add corresponding input/output // nodes in the segment graphdef. 
for (size_t i = 0; i < connections->size(); ++i) { @@ -5150,10 +5152,12 @@ Status ConvertSegmentToGraphDef( } marker_nodes.insert(node_name); auto seg_node = segment_def->add_node(); - NodeDefBuilder builder(node_name, "Placeholder"); + NodeDefBuilder builder(node_name, "_Arg"); auto status = builder.Attr("shape", partial_shape) - .Attr("dtype", dtype) + .Attr("T", dtype) + .Attr("index", arg_num) .Finalize(seg_node); + arg_num++; VLOG(1) << "Constructing input " << node_name << " for the edge " << connection.outside_node_name << ":" << connection.outside_port << " -> " << connection.inside_node_name << ":" @@ -5169,11 +5173,13 @@ Status ConvertSegmentToGraphDef( } marker_nodes.insert(node_name); auto seg_node = segment_def->add_node(); - NodeDefBuilder builder(node_name, "Identity"); + NodeDefBuilder builder(node_name, "_Retval"); auto status = - builder + builder.Attr("T", dtype) + .Attr("index", ret_num) .Input(connection.inside_node_name, connection.inside_port, dtype) .Finalize(seg_node); + ret_num++; VLOG(1) << "Constructing output " << node_name << " for the edge " << connection.inside_node_name << ":" << connection.inside_port << " -> " << connection.outside_node_name << ":" @@ -5197,12 +5203,12 @@ Status ConvertSegmentToGraphDef( if (connection.is_control_edge() || !connection.is_input_edge) continue; auto snode = segment_def->mutable_node(old_to_new_id_map[connection.inside_id]); - const string placeholder_name = + const string arg_name = StrCat(IONamePrefixes::kInputPHName, connection.port_number); VLOG(1) << "Updating " << snode->name() << ":" << connection.inside_port << " from " << snode->input(connection.inside_port) << " to " - << placeholder_name; - snode->set_input(connection.inside_port, placeholder_name); + << arg_name; + snode->set_input(connection.inside_port, arg_name); } std::set subgraph_node_names; for (const Node* node : subgraph_nodes) { From d1305cf106fc461aff05f7a08f1ed365f9ade4f6 Mon Sep 17 00:00:00 2001 From: Gunhan Gulsoy Date: Wed, 24 Jul 2019 10:41:35 -0700 Subject: [PATCH 0471/3053] Disable hdf5_format_test on windows. PiperOrigin-RevId: 259770861 --- tensorflow/python/keras/BUILD | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD index b48d3c86e79..cca09636f22 100755 --- a/tensorflow/python/keras/BUILD +++ b/tensorflow/python/keras/BUILD @@ -1650,6 +1650,9 @@ tf_py_test( "//tensorflow/python:client_testlib", ], shard_count = 4, + tags = [ + "no_windows", + ], ) tf_py_test( From 0e06e45399d0257587d97560c9045e35ca002784 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Wed, 24 Jul 2019 10:45:11 -0700 Subject: [PATCH 0472/3053] Simplify graphdef2mlir/graph-function-static-output.pbtxt test This test is intended to verify that the importer from Graph to MLIR infers properly the return type for library functions. The CHECK lines are updated to reflect this. 
PiperOrigin-RevId: 259771707 --- .../graph-function-static-output.pbtxt | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-static-output.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-static-output.pbtxt index 3ddbf783d64..e0e60c04865 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-static-output.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-static-output.pbtxt @@ -1,5 +1,9 @@ # RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s +# Verify that the return type of the functions is properly inferred +#CHECK: func @get_zeros0(%arg0: tensor<*xi32>) -> tensor<2xi32> +#CHECK: func @identity0(%arg0: tensor<*xi32>) -> tensor<*xi32> + node { name: "Placeholder" op: "Placeholder" @@ -139,16 +143,3 @@ versions { min_consumer: 12 } -#CHECK: func @main() { -#CHECK-NEXT: %0:2 = "_tf.Placeholder"() {device = "", dtype = "tfdtype$DT_BOOL", name = "Placeholder", shape = "tfshape$unknown_rank: true\0A"} : () -> (tensor<*xi1>, !_tf.control) -#CHECK-NEXT: %1:2 = "_tf.Placeholder"() {device = "", dtype = "tfdtype$DT_INT32", name = "Placeholder_1", shape = "tfshape$unknown_rank: true\0A"} : () -> (tensor<*xi32>, !_tf.control) -#CHECK-NEXT: %2:2 = "_tf.If"(%0#0, %1#0) {Tcond = "tfdtype$DT_BOOL", Tin = ["tfdtype$DT_INT32"], Tout = ["tfdtype$DT_INT32"], device = "", else_branch = @get_zeros0, is_stateless = false, name = "If", output_shapes = [], then_branch = @identity0} : (tensor<*xi1>, tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) -#CHECK-NEXT: return -#CHECK-NEXT: } -#CHECK: func @get_zeros0(%arg0: tensor<*xi32>) -> tensor<2xi32> { -#CHECK-NEXT: %0:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "const", value = dense<[1, 2]> : tensor<2xi32>} : () -> (tensor<2xi32>, !_tf.control) -#CHECK-NEXT: return %0#0 : tensor<2xi32> -#CHECK-NEXT: } -#CHECK: func @identity0(%arg0: tensor<*xi32>) -> tensor<*xi32> { -#CHECK-NEXT: return %arg0 : tensor<*xi32> -#CHECK-NEXT: } From 768a3ae4a8c56c6b4458f9bee064da571509314d Mon Sep 17 00:00:00 2001 From: Feng Liu Date: Wed, 24 Jul 2019 10:55:29 -0700 Subject: [PATCH 0473/3053] Add quantization spec for all the TFL ops All the quantization spec are from https://www.tensorflow.org/lite/performance/quantization_spec And this CL extends it to UINT8, so it matches TOCO's spec. PiperOrigin-RevId: 259773987 --- tensorflow/compiler/mlir/lite/ir/tfl_ops.td | 84 +++++++++------- tensorflow/compiler/mlir/lite/tests/ops.mlir | 2 +- .../mlir/lite/tests/prepare-quantize.mlir | 95 +++++++++++++++++++ .../mlir/lite/utils/quantization_driver.cc | 12 ++- .../mlir/lite/utils/quantization_utils.h | 6 ++ 5 files changed, 165 insertions(+), 34 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index 127a86b86ae..298f962d096 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -572,13 +572,14 @@ def TFL_FullyConnectedOptionsWeightFormatAttr : // TODO(jpienaar): Update post discussion on semantics of FC OP. // TODO(jpienaar): Include more shape verification. 
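For reference, a small sketch (an assumed helper, not part of this CL) of the affine mapping defined by the quantization spec referenced above, which is what the added scale and zero-point constraints encode:

```c++
#include <cstdint>

// Affine de-quantization from the TensorFlow Lite quantization spec:
//   real_value = (quantized_value - zero_point) * scale
inline float Dequantize(std::int32_t quantized_value, std::int32_t zero_point,
                        float scale) {
  return static_cast<float>(quantized_value - zero_point) * scale;
}
```

For example, the fixed logistic output scale constrained further down uses zero_point = 0 and scale = 1 / (max_value + 1); for an 8-bit output max_value = 255, so scale = 1/256 = 0.00390625.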
-def TFL_FullyConnectedOp : TFL_Op<"fully_connected", [NoSideEffect]> { +def TFL_FullyConnectedOp : TFL_Op<"fully_connected", [ + NoSideEffect, TFL_AccumulatorUniformScale<2, 0, 1>]> { let summary = "Fully connected op"; let arguments = (ins - TensorOf<[F32]>:$input, - TensorOf<[F32]>:$filter, - TFL_TensorOfOrNone<[F32]>:$bias, + TensorOf<[F32, TFL_QI8, TFL_QUI8]>:$input, + TensorOf<[F32, TFL_QI8, TFL_QUI8]>:$filter, + TFL_TensorOfOrNone<[F32, TFL_QI8, TFL_QUI8]>:$bias, TFL_AFAttr:$fused_activation_function, TFL_FullyConnectedOptionsWeightFormatAttr:$weights_format, @@ -587,7 +588,7 @@ def TFL_FullyConnectedOp : TFL_Op<"fully_connected", [NoSideEffect]> { // Depending on the weights format, this op can have one or two outputs. let results = (outs - Variadic>:$output + Variadic>:$output ); let hasOptions = 1; @@ -595,6 +596,7 @@ def TFL_FullyConnectedOp : TFL_Op<"fully_connected", [NoSideEffect]> { def TFL_GatherOp : TFL_Op<"gather", [ NoSideEffect, + TFL_SameOperandsAndResultsScale, TFL_OperandHasAtleastRank<0, 1>, PredOpTrait<"params and output must have same element type", TCresVTEtIsSameAsOp<0, 0>> @@ -606,7 +608,7 @@ def TFL_GatherOp : TFL_Op<"gather", [ }]; let arguments = (ins - TensorOf<[F32, I8, I32, I64, TFL_Str]>:$params, + TensorOf<[F32, I8, I32, I64, TFL_Str, TFL_QI8, TFL_QUI8]>:$params, TensorOf<[I32, I64]>:$indices, I32Attr:$axis ); @@ -619,7 +621,7 @@ def TFL_GatherOp : TFL_Op<"gather", [ ]; let results = (outs - TensorOf<[F32, I16, I32, I64, TFL_Str]>:$output + TensorOf<[F32, I16, I32, I64, TFL_Str, TFL_QI8, TFL_QUI8]>:$output ); let hasOptions = 1; @@ -644,7 +646,8 @@ def TFL_GatherNdOp : TFL_Op<"gather_nd", [NoSideEffect]> { } // Same type check of lhs and rhs is handled by the Broadcastable trait. -def TFL_LessEqualOp : TFL_Op<"less_equal", [Broadcastable, NoSideEffect]> { +def TFL_LessEqualOp : TFL_Op<"less_equal", [ + Broadcastable, NoSideEffect, TFL_NoQuantizableResult]> { let summary = "Less_equal operator"; let description = [{ @@ -652,8 +655,8 @@ def TFL_LessEqualOp : TFL_Op<"less_equal", [Broadcastable, NoSideEffect]> { }]; let arguments = ( - ins TensorOf<[F32, I32, I64, I8, TFL_Uint8]>:$lhs, - TensorOf<[F32, I32, I64, I8, TFL_Uint8]>:$rhs); + ins TensorOf<[F32, I32, I64, I8, TFL_QI8, TFL_QUI8, TFL_Uint8]>:$lhs, + TensorOf<[F32, I32, I64, I8, TFL_QI8, TFL_QUI8, TFL_Uint8]>:$rhs); let results = (outs TFL_BoolTensor:$output); @@ -699,7 +702,8 @@ convolutional neural networks (NIPS 2012)](http://papers.nips.cc/paper/4824-imag let hasOptions = 1; } -def TFL_GreaterEqualOp : TFL_Op<"greater_equal", [Broadcastable, NoSideEffect]> { +def TFL_GreaterEqualOp : TFL_Op<"greater_equal", [ + Broadcastable, NoSideEffect, TFL_NoQuantizableResult]> { let summary = "Greater_equal operator"; let description = [{ @@ -788,7 +792,8 @@ def TFL_EluOp: TFL_Op<"elu", [NoSideEffect, SameOperandsAndResultType]> { } def TFL_EqualOp: TFL_Op<"equal", [Commutative, Broadcastable, - PredOpTrait<"Operands have same value type", TCopVTEtIsSameAs<0, 1>>]> { + TFL_NoQuantizableResult, + PredOpTrait<"Operands have same value type", TCopVTEtIsSameAs<0, 1>>]> { let summary = "Equal operator"; let description = [{ @@ -797,8 +802,8 @@ def TFL_EqualOp: TFL_Op<"equal", [Commutative, Broadcastable, let arguments = ( ins - TensorOf<[I1, F32, I32, I64, I8, TFL_Uint8]>:$x, - TensorOf<[I1, F32, I32, I64, I8, TFL_Uint8]>:$y + TensorOf<[I1, F32, I32, I64, I8, TFL_QI8, TFL_QUI8, TFL_Uint8]>:$x, + TensorOf<[I1, F32, I32, I64, I8, TFL_QI8, TFL_QUI8, TFL_Uint8]>:$y ); let results = (outs TFL_BoolTensor:$output); @@ 
-1169,7 +1174,8 @@ def TFL_MaxPool2DOp : TFL_Op<"max_pool_2d", [ let customOption = "Pool2DOptions"; } -def TFL_MaximumOp : TFL_Op<"maximum", [Broadcastable, NoSideEffect, Commutative]> { +def TFL_MaximumOp : TFL_Op<"maximum", [ + Broadcastable, NoSideEffect, Commutative, TFL_SameOperandsAndResultsScale]> { let summary = "Max operator"; let description = [{ Element-wise max operation. @@ -1187,7 +1193,7 @@ def TFL_MaximumOp : TFL_Op<"maximum", [Broadcastable, NoSideEffect, Commutative] let hasOptions = 0; } -def TFL_MeanOp : TFL_Op<"mean", [NoSideEffect]> { +def TFL_MeanOp : TFL_Op<"mean", [NoSideEffect, TFL_SameOperandsAndResultsScale]> { let summary = "Mean operator"; let description = [{ @@ -1199,12 +1205,13 @@ def TFL_MeanOp : TFL_Op<"mean", [NoSideEffect]> { }]; let arguments = (ins - TensorOf<[F32, I8, I32, I64, TFL_Uint8]>:$input, + TensorOf<[F32, I8, I32, I64, TFL_QI8, TFL_QUI8, TFL_Uint8]>:$input, TensorOf<[I32, I64]>:$axis, BoolAttr:$keep_dims ); - let results = (outs TensorOf<[F32, I32, I64, I8]>:$output); + let results = (outs + TensorOf<[F32, I32, I64, I8, TFL_QI8, TFL_QUI8, TFL_Uint8]>:$output); let hasOptions = 1; let customOption = "ReducerOptions"; @@ -1238,7 +1245,8 @@ def TFL_OneHotOp : TFL_Op<"one_hot", [NoSideEffect]> { let hasOptions = 1; } -def TFL_SliceOp : TFL_Op<"slice", [NoSideEffect]> { +def TFL_SliceOp : TFL_Op<"slice", [ + NoSideEffect, TFL_SameOperandsAndResultsScale]> { let summary = "Return a slice from 'input'."; let description = [{ @@ -1337,7 +1345,8 @@ def TFL_ReduceProdOp: TFL_Op<"reduce_prod", [NoSideEffect]> { let customOption = "ReducerOptions"; } -def TFL_MinimumOp : TFL_Op<"minimum", [Broadcastable, NoSideEffect, Commutative]> { +def TFL_MinimumOp : TFL_Op<"minimum", [ + Broadcastable, NoSideEffect, Commutative, TFL_SameOperandsAndResultsScale]> { let summary = "Min operator"; let description = [{ Element-wise min operation. @@ -1442,6 +1451,7 @@ def TFL_PackOp : TFL_Op<"pack", [NoSideEffect]> { def TFL_PadOp : TFL_Op<"pad", [ NoSideEffect, + TFL_SameOperandsAndResultsScale, TFL_OperandHasRank<1, 2>, TFL_OperandRankEquals1DimOfOperand<0, 1>]> { let summary = "Padding operator"; @@ -1471,16 +1481,17 @@ def TFL_PadOp : TFL_Op<"pad", [ }]; let arguments = ( - ins TensorOf<[F32, I8, I32, I64]>:$input, + ins TensorOf<[F32, I8, I32, I64, TFL_QI8, TFL_QUI8]>:$input, TFL_I32OrI64Tensor:$padding); - let results = (outs TensorOf<[F32, I8, I32, I64]>:$output); + let results = (outs TensorOf<[F32, I8, I32, I64, TFL_QI8, TFL_QUI8]>:$output); let hasOptions = 1; } def TFL_PadV2Op : TFL_Op<"padv2", [ NoSideEffect, + TFL_SameOperandsAndResultsScale, TFL_OperandHasRank<1, 2>, TFL_OperandHasRank<2, 0>, TFL_OperandRankEquals1DimOfOperand<0, 1>, @@ -1515,11 +1526,11 @@ def TFL_PadV2Op : TFL_Op<"padv2", [ }]; let arguments = ( - ins TensorOf<[F32, I8, I32, I64]>:$input, + ins TensorOf<[F32, I8, I32, I64, TFL_QI8, TFL_QUI8]>:$input, TFL_I32OrI64Tensor:$padding, TensorOf<[F32, I8, I32, I64]>:$constant_values); - let results = (outs TensorOf<[F32, I8, I32, I64]>:$output); + let results = (outs TensorOf<[F32, I8, I32, I64, TFL_QI8, TFL_QUI8]>:$output); let hasOptions = 1; } @@ -1663,7 +1674,13 @@ def TFL_ShapeOp: TFL_Op<"shape", [NoSideEffect, TFL_NoQuantizableResult]> { let hasOptions = 1; } -def TFL_LogisticOp: TFL_Op<"logistic", [NoSideEffect, SameOperandsAndResultType]> { +def TFL_LogisticOp: TFL_Op<"logistic", [ + NoSideEffect, + SameOperandsAndResultType, + // zero_point = 0 + // scale = 1. 
/ (max_value + 1) + TFL_FixedResultScale>, + TFL_FixedResultScale>]> { let summary = "Logistic operator"; let description = [{ @@ -2019,6 +2036,7 @@ def TFL_ZerosLikeOp: TFL_Op<"zeros_like", [NoSideEffect]> { def TFL_BatchToSpaceNdOp: TFL_Op<"batch_to_space_nd", [ NoSideEffect, + TFL_SameOperandsAndResultsScale, PredOpTrait<"input and output must have same element type", TCresVTEtIsSameAsOp<0, 0>> ]> { @@ -2029,18 +2047,19 @@ def TFL_BatchToSpaceNdOp: TFL_Op<"batch_to_space_nd", [ }]; let arguments = (ins - TensorOf<[F32, I8, I32, I64]>:$input, + TensorOf<[F32, I8, I32, I64, TFL_QI8, TFL_QUI8]>:$input, TensorOf<[I32]>:$block_shape, TensorOf<[I32]>:$indices ); let results = (outs - TensorOf<[F32, I16, I32, I64]>:$output + TensorOf<[F32, I16, I32, I64, TFL_QI8, TFL_QUI8]>:$output ); } def TFL_SpaceToBatchNdOp: TFL_Op<"space_to_batch_nd", [ NoSideEffect, + TFL_SameOperandsAndResultsScale, PredOpTrait<"input and output must have same element type", TCresVTEtIsSameAsOp<0, 0>> ]> { @@ -2051,13 +2070,13 @@ def TFL_SpaceToBatchNdOp: TFL_Op<"space_to_batch_nd", [ }]; let arguments = (ins - TensorOf<[F32, I8, I32, I64]>:$input, + TensorOf<[F32, I8, I32, I64, TFL_QI8, TFL_QUI8]>:$input, TensorOf<[I32]>:$block_shape, TensorOf<[I32]>:$paddings ); let results = (outs - TensorOf<[F32, I16, I32, I64]>:$output + TensorOf<[F32, I16, I32, I64, TFL_QI8, TFL_QUI8]>:$output ); } @@ -2106,7 +2125,8 @@ def TFL_SplitVOp : TFL_Op<"split_v", [NoSideEffect]> { let hasOptions = 1; } -def TFL_ResizeBilinearOp: TFL_Op<"resize_bilinear", [NoSideEffect]> { +def TFL_ResizeBilinearOp: TFL_Op<"resize_bilinear", [ + NoSideEffect, TFL_SameOperandsAndResultsScale]> { let summary = "ResizeBilinear Op"; let description = [{ @@ -2115,12 +2135,12 @@ def TFL_ResizeBilinearOp: TFL_Op<"resize_bilinear", [NoSideEffect]> { let arguments = (ins // TODO(ycling): Support quantized types. 
- TensorOf<[F32, I32]>:$input, + TensorOf<[F32, I32, TFL_QI8, TFL_QUI8]>:$input, TensorOf<[I32]>:$size, BoolAttr:$align_corners); let results = (outs - TensorOf<[F32]>:$output + TensorOf<[F32, TFL_QI8, TFL_QUI8]>:$output ); let hasOptions = 1; diff --git a/tensorflow/compiler/mlir/lite/tests/ops.mlir b/tensorflow/compiler/mlir/lite/tests/ops.mlir index ec31bf34b70..c627b9ebc3e 100644 --- a/tensorflow/compiler/mlir/lite/tests/ops.mlir +++ b/tensorflow/compiler/mlir/lite/tests/ops.mlir @@ -879,7 +879,7 @@ func @testResizeBilinear(%arg0 : tensor<1x100x100x3xf32>, %arg1 : tensor<4xi32>) // ----- func @testResizeBilinearInvalidOutputType(%arg0 : tensor<1x100x100x3xf32>, %arg1 : tensor<4xi32>) -> tensor { - // expected-error @+1 {{'tfl.resize_bilinear' op result #0 must be tensor of 32-bit float values}} + // expected-error @+1 {{'tfl.resize_bilinear' op result #0 must be tensor of 32-bit float or QI8 type or QUI8 type values}} %0 = "tfl.resize_bilinear"(%arg0, %arg1) {align_corners = false} : (tensor<1x100x100x3xf32>, tensor<4xi32>) -> tensor return %0 : tensor } diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir index a3e7c01ca91..f2ca7136d54 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir @@ -35,6 +35,27 @@ func @QuantizeConv2D(tensor<1x224x224x3x!quant.uniform // CHECK: return %6 } +// CHECK-LABEL: QuantizeFullyConnected +func @QuantizeFullyConnected(tensor<1x224x224x3x!quant.uniform>) -> tensor<1x112x112x32x!quant.uniform> { +^bb0(%arg0: tensor<1x224x224x3x!quant.uniform>): + %cst = constant dense<-1.23697901> : tensor<32xf32> + %2 = "tfl.dequantize"(%arg0) : (tensor<1x224x224x3x!quant.uniform>) -> tensor<1x224x224x3xf32> + %3 = "tfl.pseudo_qconst"() {qtype = tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>>, value = dense<-76> : tensor<32x3x3x3xi8>} : () -> tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>> + %4 = "tfl.dequantize"(%3) : (tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>>) -> tensor<32x3x3x3xf32> + %5 = "tfl.fully_connected"(%2, %4, %cst) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<1x224x224x3xf32>, tensor<32x3x3x3xf32>, tensor<32xf32>) -> tensor<1x112x112x32xf32> + %6 = "tfl.quantize"(%5) {qtype = tensor<1x112x112x32x!quant.uniform>} : (tensor<1x112x112x32xf32>) -> tensor<1x112x112x32x!quant.uniform> + return %6 : tensor<1x112x112x32x!quant.uniform> + +// CHECK: %cst = constant dense<-1.23697901> : tensor<32xf32> +// CHECK: %0 = "tfl.quantize"(%cst) {qtype = tensor<32x!quant.uniform>} +// CHECK: %1 = "tfl.dequantize"(%0) : (tensor<32x!quant.uniform>) +// CHECK: %2 = "tfl.dequantize"(%arg0) +// CHECK: %3 = "tfl.pseudo_qconst"() +// CHECK: %4 = "tfl.dequantize"(%3) +// CHECK: %5 = "tfl.fully_connected"(%2, %4, %1) +// CHECK: %6 = "tfl.quantize"(%5) +// CHECK: return %6 +} // CHECK-LABEL: QuantizeDepthwiseConv2D func @QuantizeDepthwiseConv2D(tensor<1x224x224x3x!quant.uniform>) -> tensor<1x112x112x32x!quant.uniform> { @@ -74,6 +95,66 @@ func @QuantizeAveragePool2D(tensor<1x6x6x16x!quant.uniform } +// CHECK-LABEL: QuantizeMaximum +func @QuantizeMaximum(tensor<1x6x6x16x!quant.uniform>, tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> { +^bb0(%arg0: tensor<1x6x6x16x!quant.uniform>, %arg1: tensor<1x6x6x16x!quant.uniform>): + %0 = "tfl.dequantize"(%arg0) : (tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> + %1 = 
"tfl.dequantize"(%arg1) : (tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> + %2 = "tfl.maximum"(%0, %1) : (tensor<1x6x6x16xf32>, tensor<1x6x6x16xf32>) -> tensor<1x6x6x16xf32> + return %2 : tensor<1x6x6x16xf32> + +// CHECK: %0 = "tfl.dequantize"(%arg0) +// CHECK: %1 = "tfl.dequantize"(%arg1) +// CHECK: %2 = "tfl.maximum"(%0, %1) +// CHECK: %3 = "tfl.quantize"(%2) +// CHECK: %4 = "tfl.dequantize"(%3) +// CHECK: return %4 : tensor<1x6x6x16xf32> +} + +// CHECK-LABEL: QuantizeMinimum +func @QuantizeMinimum(tensor<1x6x6x16x!quant.uniform>, tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> { +^bb0(%arg0: tensor<1x6x6x16x!quant.uniform>, %arg1: tensor<1x6x6x16x!quant.uniform>): + %0 = "tfl.dequantize"(%arg0) : (tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> + %1 = "tfl.dequantize"(%arg1) : (tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> + %2 = "tfl.minimum"(%0, %1) : (tensor<1x6x6x16xf32>, tensor<1x6x6x16xf32>) -> tensor<1x6x6x16xf32> + return %2 : tensor<1x6x6x16xf32> + +// CHECK: %0 = "tfl.dequantize"(%arg0) +// CHECK: %1 = "tfl.dequantize"(%arg1) +// CHECK: %2 = "tfl.minimum"(%0, %1) +// CHECK: %3 = "tfl.quantize"(%2) +// CHECK: %4 = "tfl.dequantize"(%3) +// CHECK: return %4 : tensor<1x6x6x16xf32> +} + +// CHECK-LABEL: QuantizeSlice +func @QuantizeSlice(tensor<2x3x5x!quant.uniform>, tensor<3xi32>, tensor<3xi32>) -> tensor { +^bb0(%arg0: tensor<2x3x5x!quant.uniform>, %arg1: tensor<3xi32>, %arg2: tensor<3xi32>): + %0 = "tfl.dequantize"(%arg0) : (tensor<2x3x5x!quant.uniform>) -> tensor<2x3x5xf32> + %1 = "tfl.slice"(%0, %arg1, %arg2) : (tensor<2x3x5xf32>, tensor<3xi32>, tensor<3xi32>) -> tensor + return %1 : tensor + +// CHECK: %0 = "tfl.dequantize"(%arg0) +// CHECK: %1 = "tfl.slice"(%0, %arg1, %arg2) +// CHECK: %2 = "tfl.quantize"(%1) +// CHECK: %3 = "tfl.dequantize"(%2) +// CHECK: return %3 : tensor +} + +// CHECK-LABEL: QuantizePad +func @QuantizePad(tensor<2x1x3x!quant.uniform>, tensor<3x2xi32>) -> tensor { +^bb0(%arg0: tensor<2x1x3x!quant.uniform>, %arg1: tensor<3x2xi32>): + %0 = "tfl.dequantize"(%arg0) : (tensor<2x1x3x!quant.uniform>) -> tensor<2x1x3xf32> + %1 = "tfl.pad"(%0, %arg1) : (tensor<2x1x3xf32>, tensor<3x2xi32>) -> tensor + return %1 : tensor + +// CHECK: %0 = "tfl.dequantize"(%arg0) +// CHECK: %1 = "tfl.pad"(%0, %arg1) +// CHECK: %2 = "tfl.quantize"(%1) +// CHECK: %3 = "tfl.dequantize"(%2) +// CHECK: return %3 : tensor +} + // CHECK-LABEL: QuantizeReshape2D func @QuantizeReshape2D(tensor<1x6x6x16x!quant.uniform>) -> tensor<1x36x16xf32> { ^bb0(%arg0: tensor<1x6x6x16x!quant.uniform>): @@ -102,6 +183,20 @@ func @QuantizeSoftmax(tensor<1x6x6x16x!quant.uniform>) // CHECK: return %3 : tensor<1x6x6x16xf32> } +// CHECK-LABEL: QuantizeLogistic +func @QuantizeLogistic(tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> { +^bb0(%arg0: tensor<1x6x6x16x!quant.uniform>): + %0 = "tfl.dequantize"(%arg0) : (tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> + %1 = "tfl.logistic"(%0) : (tensor<1x6x6x16xf32>) -> tensor<1x6x6x16xf32> + return %1 : tensor<1x6x6x16xf32> + +// CHECK: %0 = "tfl.dequantize"(%arg0) +// CHECK: %1 = "tfl.logistic"(%0) : (tensor<1x6x6x16xf32>) -> tensor<1x6x6x16xf32> +// CHECK: %2 = "tfl.quantize"(%1) {qtype = tensor<1x6x6x16x!quant.uniform>} +// CHECK: %3 = "tfl.dequantize"(%2) : (tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> +// CHECK: return %3 : tensor<1x6x6x16xf32> +} + // CHECK-LABEL: QuantizeConcatOperand0ToAll func @QuantizeConcatOperand0ToAll(tensor<2x!quant.uniform>, tensor<2xf32>) -> tensor<2x2xf32> { 
^bb0(%arg0: tensor<2x!quant.uniform>, %arg1: tensor<2xf32>): diff --git a/tensorflow/compiler/mlir/lite/utils/quantization_driver.cc b/tensorflow/compiler/mlir/lite/utils/quantization_driver.cc index 1ab00ec3634..956c1f1434d 100644 --- a/tensorflow/compiler/mlir/lite/utils/quantization_driver.cc +++ b/tensorflow/compiler/mlir/lite/utils/quantization_driver.cc @@ -432,6 +432,9 @@ void QuantizationDriver::QuantizeValue(Value *value, QuantParams params, Location loc) { Type expressed_type = value->getType(); Type new_type = params.castFromExpressedType(expressed_type); + // This value isn't an expressed type (float), skip. + if (!new_type) return; + TypeAttr type_attr = builder_.getTypeAttr(new_type); auto quantize = builder_.create(loc, new_type, value, type_attr); @@ -482,10 +485,15 @@ void QuantizationDriver::RequantizeValue(Value *value, RequantizeState *state, } else { Type expressed_type = quant::QuantizedType::castToExpressedType(value->getType()); + if (!expressed_type) return; + // The value needs to be requantized. A Quantize op will be created to use // it as the operand and replace its uses. new_type = state->params.castFromExpressedType(expressed_type); } + // This value isn't an expressed type (float), skip. + if (!new_type) return; + TypeAttr type_attr = builder_.getTypeAttr(new_type); auto requantize_op = builder_.create(loc, new_type, value, type_attr); @@ -648,11 +656,13 @@ bool QuantizationDriver::PropagateParams() { for (int res = 0, e = op->getNumResults(); res != e; ++res) changed |= SetResultParams(op, res, params); } + // TODO(fengliuai): make the bit width configurable. auto key = std::make_pair(8, is_signed_); auto &restricted_outputs = spec->restricted_output_params[key]; - for (int i = 0, e = restricted_outputs.size(); i != e; ++i) + for (int i = 0, e = restricted_outputs.size(); i != e; ++i) { changed |= SetResultParams(op, i, restricted_outputs[i]); + } for (auto &it : spec->biases_params) { auto params = diff --git a/tensorflow/compiler/mlir/lite/utils/quantization_utils.h b/tensorflow/compiler/mlir/lite/utils/quantization_utils.h index 941ce636bc1..d2c58084679 100644 --- a/tensorflow/compiler/mlir/lite/utils/quantization_utils.h +++ b/tensorflow/compiler/mlir/lite/utils/quantization_utils.h @@ -63,8 +63,14 @@ struct GenericFullQuantizationPattern : public RewritePattern { inputs.reserve(quantized_op->getNumOperands()); for (int i = 0, e = quantized_op->getNumOperands(); i != e; ++i) { auto* operand = quantized_op->getOperand(i); + auto operand_ele_type = + operand->getType().template cast().getElementType(); if (auto op_inst = dyn_cast_or_null(operand->getDefiningOp())) { inputs.push_back(op_inst.input()); + } else if (operand_ele_type.template isa()) { + // If the operand is an integer tensor, then it doesn't require the + // DQ op in the pattern. + inputs.push_back(operand); } else { return matchFailure(); } From fbb355779e65a003954ec17f78af1ebe15ab71c3 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Wed, 24 Jul 2019 11:00:42 -0700 Subject: [PATCH 0474/3053] Simplify graphdef2mlir/graph-functional-while-loop.pbtxt test This test is intended to check that we don't error out and produce a valid IR when importing a Graph with functions that have input names conflicting with the main graph input names. There isn't much to CHECK in the output, the verifier is ran which should be enough. We reduce the maintenance on the test by removing most of the CHECK. 
PiperOrigin-RevId: 259775067 --- .../graph-functional-while-loop.pbtxt | 27 +++++-------------- 1 file changed, 7 insertions(+), 20 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-functional-while-loop.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-functional-while-loop.pbtxt index 456bf4951bd..ba94c600cf2 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-functional-while-loop.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-functional-while-loop.pbtxt @@ -1,5 +1,12 @@ # RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-input-arrays=input -tf-input-data-types=DT_INT32 -tf-input-shapes='' -tf-output-arrays=while:2 -o - | FileCheck %s +# This check that we don't error out when importing GraphDef containing +# functions with arg name that are the same as the graph input name + +# CHECK: func @main(%arg0: tensor) -> tensor +# CHECK: func @while_body +# CHECK: func @while_cond + node { name: "input" op: "Placeholder" @@ -295,23 +302,3 @@ versions { min_consumer: 12 } -# CHECK: func @main(%arg0: tensor) -> tensor -# CHECK-NEXT: attributes {tf.entry_function = {inputs = "input", outputs = "while"}} { -# CHECK-NEXT: %0:2 = "_tf.Placeholder.input"(%arg0) {_user_specified_name = "input", device = "", dtype = "tfdtype$DT_INT32", name = "input", shape = "tfshape$"} : (tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "while/loop_counter", value = dense<0> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %2:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "while/maximum_iterations", value = dense<-1> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %3:4 = "_tf.While"(%1#0, %2#0, %0#0) {T = ["tfdtype$DT_INT32", "tfdtype$DT_INT32", "tfdtype$DT_INT32"], _lower_using_switch_merge = true, body = @while_body_60, cond = @while_cond_50, device = "", name = "while", output_shapes = ["tfshape$", "tfshape$", "tfshape$"], parallel_iterations = 10 : i64} : (tensor, tensor, tensor) -> (tensor, tensor, tensor, !_tf.control) -# CHECK-NEXT: return %3#2 : tensor -# CHECK-NEXT: } -# CHECK: func @while_body_60(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>, %arg2: tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi32>, tensor<*xi32>) { -# CHECK-NEXT: %0:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "Add/y", value = dense<1> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "add_1/y", value = dense<1> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %2:2 = "_tf.Add"(%arg2, %0#0) {T = "tfdtype$DT_INT32", device = "", name = "Add"} : (tensor<*xi32>, tensor) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %3:2 = "_tf.Add"(%arg0, %1#0) {T = "tfdtype$DT_INT32", device = "", name = "add_1"} : (tensor<*xi32>, tensor) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: return %3#0, %arg1, %2#0 : tensor<*xi32>, tensor<*xi32>, tensor<*xi32> -# CHECK-NEXT: } -# CHECK: func @while_cond_50(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>, %arg2: tensor<*xi32>) -> tensor<*xi1> { -# CHECK-NEXT: %0:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "Less/y", value = dense<10> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Less"(%arg2, %0#0) {T = "tfdtype$DT_INT32", device = "", name = "Less"} : (tensor<*xi32>, tensor) -> (tensor<*xi1>, !_tf.control) -# CHECK-NEXT: return %1#0 : tensor<*xi1> -# CHECK-NEXT: } From 
4fe1667c2dc3cb8c56751babe9eda0742574b3af Mon Sep 17 00:00:00 2001 From: Gunhan Gulsoy Date: Wed, 24 Jul 2019 11:13:11 -0700 Subject: [PATCH 0475/3053] Disable broken lite_v2_test on kokoro until the breakage is resolved. PiperOrigin-RevId: 259777912 --- tensorflow/lite/python/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/lite/python/BUILD b/tensorflow/lite/python/BUILD index 9316da8e94c..df7c07ff5d4 100644 --- a/tensorflow/lite/python/BUILD +++ b/tensorflow/lite/python/BUILD @@ -111,6 +111,7 @@ py_test( srcs = ["lite_v2_test.py"], srcs_version = "PY2AND3", tags = [ + "no_oss", "no_windows", ], deps = [ From 3198b9be2ee031f3ebcb946b7fa6e81dec23fee0 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Wed, 24 Jul 2019 11:16:39 -0700 Subject: [PATCH 0476/3053] Simplify graphdef2mlir/graph-gradient-def.pbtxt test to be more targeted This test intends to check that we correctly add a function attribute to link to the gradient function. The CHECK lines are updated to check specifically this property. PiperOrigin-RevId: 259778608 --- .../graphdef2mlir/graph-gradient-def.pbtxt | 30 +++++-------------- 1 file changed, 7 insertions(+), 23 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-gradient-def.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-gradient-def.pbtxt index c1045bf19af..b7179ae1dcc 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-gradient-def.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-gradient-def.pbtxt @@ -1,5 +1,12 @@ # RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s +# In GraphDef custom gradient functions are modeled using GradientDef which +# links the function and its gradient. In MLIR a TF op's gradient function is +# added to its list of function attributes.
+ +# CHECK: func @foo0( +# CHECK-NEXT: tf.gradient = @foo_grad + node { name: "Const" op: "Const" @@ -269,26 +276,3 @@ versions { producer: 29 min_consumer: 12 } - -# CHECK: func @main() { -# CHECK-NEXT: %0:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_FLOAT", name = "Const", value = dense<2.500000e-01> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.foo0"(%0#0) {_disable_call_shape_inference = true, device = "", name = "foo"} : (tensor) -> (tensor<*xf32>, !_tf.control) -# CHECK-NEXT: %2:2 = "_tf.Shape"(%1#0) {T = "tfdtype$DT_FLOAT", device = "", name = "gradients/Shape", out_type = "tfdtype$DT_INT32"} : (tensor<*xf32>) -> (tensor, !_tf.control) -# CHECK-NEXT: %3:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_FLOAT", name = "gradients/grad_ys_0", value = dense<1.000000e+00> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %4:2 = "_tf.Fill"(%2#0, %3#0) {T = "tfdtype$DT_FLOAT", device = "", index_type = "tfdtype$DT_INT32", name = "gradients/Fill"} : (tensor, tensor) -> (tensor<*xf32>, !_tf.control) -# CHECK-NEXT: %5:2 = "_tf.SymbolicGradient"(%0#0, %4#0) {Tin = ["tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT"], Tout = ["tfdtype$DT_FLOAT"], device = "", f = @foo0, f._disable_call_shape_inference = true, name = "gradients/foo_grad/SymbolicGradient"} : (tensor, tensor<*xf32>) -> (tensor, !_tf.control) -# CHECK-NEXT: return -# CHECK-NEXT: } -# CHECK: func @foo_grad0(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> -# CHECK-NEXT: attributes {tf._disable_call_shape_inference = true} { -# CHECK-NEXT: %0:2 = "_tf.Mul"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", device = "", name = "mul_0"} : (tensor<*xf32>, tensor<*xf32>) -> (tensor<*xf32>, !_tf.control) -# CHECK-NEXT: return %0#0 : tensor<*xf32> -# CHECK-NEXT: } -# CHECK: func @foo0(%arg0: tensor<*xf32>) -> tensor<*xf32> -# CHECK-NEXT: attributes {tf._disable_call_shape_inference = true, tf.gradient = @foo_grad0} { -# CHECK-NEXT: %0:2 = "_tf.Exp"(%arg0) {T = "tfdtype$DT_FLOAT", device = "", name = "Exp"} : (tensor<*xf32>) -> (tensor<*xf32>, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Neg"(%arg0) {T = "tfdtype$DT_FLOAT", device = "", name = "Neg"} : (tensor<*xf32>) -> (tensor<*xf32>, !_tf.control) -# CHECK-NEXT: %2:2 = "_tf.Exp"(%1#0) {T = "tfdtype$DT_FLOAT", device = "", name = "Exp_1"} : (tensor<*xf32>) -> (tensor<*xf32>, !_tf.control) -# CHECK-NEXT: %3:2 = "_tf.Sub"(%0#0, %2#0) {T = "tfdtype$DT_FLOAT", device = "", name = "sub_0"} : (tensor<*xf32>, tensor<*xf32>) -> (tensor<*xf32>, !_tf.control) -# CHECK-NEXT: return %3#0 : tensor<*xf32> -# CHECK-NEXT: } From b3aafbda35b0d2d9a3a7647cd64971b43d23338b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 24 Jul 2019 11:16:56 -0700 Subject: [PATCH 0477/3053] Move docs for Python inference into guide/inference.md, and restructure that page to organize the load/run steps based on language. PiperOrigin-RevId: 259778674 --- tensorflow/lite/g3doc/convert/python_api.md | 79 +---- tensorflow/lite/g3doc/guide/get_started.md | 55 +-- tensorflow/lite/g3doc/guide/inference.md | 352 ++++++++++++-------- 3 files changed, 253 insertions(+), 233 deletions(-) diff --git a/tensorflow/lite/g3doc/convert/python_api.md b/tensorflow/lite/g3doc/convert/python_api.md index 1dd37ffdfd3..777c363e7fb 100644 --- a/tensorflow/lite/g3doc/convert/python_api.md +++ b/tensorflow/lite/g3doc/convert/python_api.md @@ -1,9 +1,12 @@ # Converter Python API guide -This page provides examples on how to use the TensorFlow Lite Converter and the -TensorFlow Lite interpreter using the Python API. 
+This page describes how to convert TensorFlow models into the TensorFlow Lite +format using the TensorFlow Lite Converter Python API. -Note: These docs describe the converter in the TensorFlow nightly release, +If you're looking for information about how to run a TensorFlow Lite model, +see [TensorFlow Lite inference](../guide/inference.md). + +Note: This page describes the converter in the TensorFlow nightly release, installed using `pip install tf-nightly`. For docs describing older versions reference ["Converting models from TensorFlow 1.12"](#pre_tensorflow_1.12). @@ -20,13 +23,12 @@ be targeted to devices with mobile. ## API The API for converting TensorFlow models to TensorFlow Lite is -`tf.lite.TFLiteConverter`. The API for calling the Python interpreter is -`tf.lite.Interpreter`. +`tf.lite.TFLiteConverter`, which provides class methods based on the original +format of the model. For example, `TFLiteConverter.from_session()` is available +for GraphDefs, `TFLiteConverter.from_saved_model()` is available for +SavedModels, and `TFLiteConverter.from_keras_model_file()` is available for +`tf.Keras` files. -`TFLiteConverter` provides class methods based on the original format of the -model. `TFLiteConverter.from_session()` is available for GraphDefs. -`TFLiteConverter.from_saved_model()` is available for SavedModels. -`TFLiteConverter.from_keras_model_file()` is available for `tf.Keras` files. Example usages for simple float-point models are shown in [Basic Examples](#basic). Examples usages for more complex models is shown in [Complex Examples](#complex). @@ -177,65 +179,6 @@ with tf.Session() as sess: open("converted_model.tflite", "wb").write(tflite_model) ``` -## TensorFlow Lite Python interpreter - -### Using the interpreter from a model file - -The following example shows how to use the TensorFlow Lite Python interpreter -when provided a TensorFlow Lite FlatBuffer file. The example also demonstrates -how to run inference on random input data. Run -`help(tf.lite.Interpreter)` in the Python terminal to get detailed -documentation on the interpreter. - -```python -import numpy as np -import tensorflow as tf - -# Load TFLite model and allocate tensors. -interpreter = tf.lite.Interpreter(model_path="converted_model.tflite") -interpreter.allocate_tensors() - -# Get input and output tensors. -input_details = interpreter.get_input_details() -output_details = interpreter.get_output_details() - -# Test model on random input data. -input_shape = input_details[0]['shape'] -input_data = np.array(np.random.random_sample(input_shape), dtype=np.float32) -interpreter.set_tensor(input_details[0]['index'], input_data) - -interpreter.invoke() - -# The function `get_tensor()` returns a copy of the tensor data. -# Use `tensor()` in order to get a pointer to the tensor. -output_data = interpreter.get_tensor(output_details[0]['index']) -print(output_data) -``` - -### Using the interpreter from model data - -The following example shows how to use the TensorFlow Lite Python interpreter -when starting with the TensorFlow Lite Flatbuffer model previously loaded. This -example shows an end-to-end use case, starting from building the TensorFlow -model. 
- -```python -import numpy as np -import tensorflow as tf - -img = tf.placeholder(name="img", dtype=tf.float32, shape=(1, 64, 64, 3)) -const = tf.constant([1., 2., 3.]) + tf.constant([1., 4., 4.]) -val = img + const -out = tf.identity(val, name="out") - -with tf.Session() as sess: - converter = tf.lite.TFLiteConverter.from_session(sess, [img], [out]) - tflite_model = converter.convert() - -# Load TFLite model and allocate tensors. -interpreter = tf.lite.Interpreter(model_content=tflite_model) -interpreter.allocate_tensors() -``` ## Additional instructions diff --git a/tensorflow/lite/g3doc/guide/get_started.md b/tensorflow/lite/g3doc/guide/get_started.md index e20dc08d0ca..72ddff4a8f0 100644 --- a/tensorflow/lite/g3doc/guide/get_started.md +++ b/tensorflow/lite/g3doc/guide/get_started.md @@ -4,22 +4,27 @@ TensorFlow Lite provides all the tools you need to convert and run TensorFlow models on mobile, embedded, and IoT devices. The following guide walks through each step of the developer workflow and provides links to further instructions. +[TOC] + ## 1. Choose a model -TensorFlow Lite allows you to run TensorFlow models on a wide range of devices. A TensorFlow model is a data structure that contains the logic and knowledge of a machine learning network trained to solve a particular problem. - There are many ways to obtain a TensorFlow model, from using pre-trained models -to training your own. To use a model with TensorFlow Lite it must be converted -into a special format. This is explained in section 2, -[Convert the model](#2_convert_the_model_format). +to training your own. + +To use a model with TensorFlow Lite, you must convert a +full TensorFlow model into the TensorFlow Lite format—you +cannot create or train a model using TensorFlow Lite. So you must start with a +regular TensorFlow model, and then +[convert the model](#2_convert_the_model_format). + +Note: TensorFlow Lite supports a limited subset of TensorFlow operations, so not +all models can be converted. For details, read about the +[TensorFlow Lite operator compatibility](ops_compatibility.md). -Note: Not all TensorFlow models will work with TensorFlow Lite, since the -interpreter supports a limited subset of TensorFlow operations. See section 2, -[Convert the model](#2_convert_the_model_format) to learn about compatibility. ### Use a pre-trained model @@ -60,35 +65,37 @@ flowers with TensorFlow codelab. ### Train a custom model If you have designed and trained your own TensorFlow model, or you have trained -a model obtained from another source, you should convert it to the TensorFlow -Lite format before use. +a model obtained from another source, you must +[convert it to the TensorFlow Lite format](#2_convert_the_model_format). ## 2. Convert the model -TensorFlow Lite is designed to execute models efficiently on devices. Some of +TensorFlow Lite is designed to execute models efficiently on mobile and other +embedded devices with limited compute and memory resources. Some of this efficiency comes from the use of a special format for storing models. TensorFlow models must be converted into this format before they can be used by TensorFlow Lite. Converting models reduces their file size and introduces optimizations that do -not affect accuracy. Developers can opt to further reduce file size and increase -speed of execution in exchange for some trade-offs. You can use the TensorFlow -Lite converter to choose which optimizations to apply. +not affect accuracy. 
The TensorFlow Lite converter provides options +that allow you to further reduce file size and increase speed of execution, with +some trade-offs. + +Note: TensorFlow Lite supports a limited subset of TensorFlow operations, so not +all models can be converted. For details, read about the +[TensorFlow Lite operator compatibility](ops_compatibility.md). -TensorFlow Lite supports a limited subset of TensorFlow operations, so not all -models can be converted. See [Ops compatibility](#ops-compatibility) for more -information. ### TensorFlow Lite converter -The [TensorFlow Lite converter](../convert) is a tool that converts trained -TensorFlow models into the TensorFlow Lite format. It can also introduce -optimizations, which are covered in section 4, +The [TensorFlow Lite converter](../convert) is a tool available as a Python API +that converts trained TensorFlow models into the TensorFlow Lite format. It can +also introduce optimizations, which are covered in section 4, [Optimize your model](#4_optimize_your_model_optional). -The converter is available as a Python API. The following example shows a +The following example shows a TensorFlow `SavedModel` being converted into the TensorFlow Lite format: ```python @@ -128,9 +135,9 @@ performance or reduce file size. This is covered in section 4, ### Ops compatibility -TensorFlow Lite currently supports a [limited subset](ops_compatibility.md) of -TensorFlow operations. The long term goal is for all TensorFlow operations to be -supported. +TensorFlow Lite currently supports a [limited subset of TensorFlow +operations](ops_compatibility.md). The long term goal is for all TensorFlow +operations to be supported. If the model you wish to convert contains unsupported operations, you can use [TensorFlow Select](ops_select.md) to include operations from TensorFlow. This diff --git a/tensorflow/lite/g3doc/guide/inference.md b/tensorflow/lite/g3doc/guide/inference.md index 353a656740e..4f5ddeb976b 100644 --- a/tensorflow/lite/g3doc/guide/inference.md +++ b/tensorflow/lite/g3doc/guide/inference.md @@ -1,91 +1,104 @@ # TensorFlow Lite inference The term *inference* refers to the process of executing a TensorFlow Lite model -on-device in order to make predictions based on input data. Inference is the -final step in using the model on-device. +on-device in order to make predictions based on input data. To perform an +inference with a TensorFlow Lite model, you must run it through an +*interpreter*. The TensorFlow Lite interpreter is designed to be lean and fast. +The interpreter uses a static graph ordering and a custom (less-dynamic) memory +allocator to ensure minimal load, initialization, and execution latency. -Inference for TensorFlow Lite models is run through an interpreter. The -TensorFlow Lite interpreter is designed to be lean and fast. The interpreter -uses a static graph ordering and a custom (less-dynamic) memory allocator to -ensure minimal load, initialization, and execution latency. +This page describes how to access to the TensorFlow Lite interpreter and +perform an inference using C++, Java, and Python, plus links to other resources +for each [supported platform](#supported-platforms). -This document outlines the various APIs for the interpreter, along with the -[supported platforms](#supported-platforms). +[TOC] -### Important Concepts +## Important concepts -TensorFlow Lite inference on device typically follows the following steps. +TensorFlow Lite inference typically follows the following steps: -1. **Loading a Model** +1. 
**Loading a model** - The user loads the `.tflite` model into memory which contains the model's + You must load the `.tflite` model into memory, which contains the model's execution graph. -1. **Transforming Data** - Input data acquired by the user generally may not match the input data format - expected by the model. For eg., a user may need to resize an image or change - the image format to be used by the model. +1. **Transforming data** -1. **Running Inference** + Raw input data for the model generally does not match the input data format + expected by the model. For example, you might need to resize an image or + change the image format to be compatible with the model. - This step involves using the API to execute the model. It involves a few - steps such as building the interpreter, and allocating tensors as explained - in detail in [Running a Model](#running_a_model). +1. **Running inference** -1. **Interpreting Output** + This step involves using the TensorFlow Lite API to execute the model. It + involves a few steps such as building the interpreter, and allocating + tensors, as described in the following sections. - The user retrieves results from model inference and interprets the tensors in - a meaningful way to be used in the application. +1. **Interpreting output** - For example, a model may only return a list of probabilities. It is up to the - application developer to meaningully map them to relevant categories and - present it to their user. + When you receive results from the model inference, you must interpret the + tensors in a meaningful way that's useful in your application. -### Supported Platforms + For example, a model might return only a list of probabilities. It's up to + you to map the probabilities to relevant categories and present it to your + end-user. + +## Supported platforms TensorFlow inference APIs are provided for most common mobile/embedded platforms -such as Android, iOS and Linux. +such as Android, iOS and Linux, in multiple programming languages. -#### Android +In most cases, the API design reflects a preference for performance over ease of +use. TensorFlow Lite is designed for fast inference on small devices, so it +should be no surprise that the APIs try to avoid unnecessary copies at the +expense of convenience. Similarly, consistency with TensorFlow APIs was not an +explicit goal and some variance between languages is to be expected. + +Across all libraries, the TensorFlow Lite API enables you to load models, +feed inputs, and retrieve inference outputs. + +### Android On Android, TensorFlow Lite inference can be performed using either Java or C++ APIs. The Java APIs provide convenience and can be used directly within your Android Activity classes. The C++ APIs offer more flexibility and speed, but may require writing JNI wrappers to move data between Java and C++ layers. -Visit the [Android quickstart](android.md) for a tutorial and example code. +See below for details about using C++ and Java, or +follow the [Android quickstart](android.md) for a tutorial and example code. -#### iOS +### iOS -TensorFlow Lite provides native iOS libraries written in +On iOS, TensorFlow Lite is available with native iOS libraries written in [Swift](https://www.tensorflow.org/code/tensorflow/lite/experimental/swift) and [Objective-C](https://www.tensorflow.org/code/tensorflow/lite/experimental/objc). -Visit the [iOS quickstart](ios.md) for a tutorial and example code. 
+This page doesn't include a discussion for about these languages, so you should +refer to the [iOS quickstart](ios.md) for a tutorial and example code. -#### Linux -On Linux platforms such as [Raspberry Pi](build_rpi.md), TensorFlow Lite C++ -and Python APIs can be used to run inference. +### Linux + +On Linux platforms (including [Raspberry Pi](build_rpi.md)), you can run +inferences using TensorFlow Lite APIs available in C++ and Python, as shown +in the following sections. -## API Guides +## Load and run a model in C++ -TensorFlow Lite provides programming APIs in C++, Java and Python, with -experimental bindings for several other languages (C, Swift, Objective-C). In -most cases, the API design reflects a preference for performance over ease of -use. TensorFlow Lite is designed for fast inference on small devices so it -should be no surprise that the APIs try to avoid unnecessary copies at the -expense of convenience. Similarly, consistency with TensorFlow APIs was not an -explicit goal and some variance is to be expected. +Running a TensorFlow Lite model with C++ involves a few simple steps: -There is also a [Python API for TensorFlow Lite](../convert/python_api.md). + 1. Load the model into memory as a `FlatBufferModel`. + 2. Build an `Interpreter` based on an existing `FlatBufferModel`. + 3. Set input tensor values. (Optionally resize input tensors if the + predefined sizes are not desired.) + 4. Invoke inference. + 5. Read output tensor values. -### Loading a Model - -#### C++ -The `FlatBufferModel` class encapsulates a model and can be built in a couple of -slightly different ways depending on where the model is stored: +The [`FlatBufferModel`]( +https://www.tensorflow.org/lite/api_docs/cc/class/tflite/flat-buffer-model.html) +class encapsulates a TensorFlow Lite model and you can +build it in a couple of different ways, depending on where the model is stored: ```c++ class FlatBufferModel { @@ -104,72 +117,36 @@ class FlatBufferModel { }; ``` -```c++ -tflite::FlatBufferModel model(path_to_model); -``` +Note: If TensorFlow Lite detects the presence of the [Android NNAPI]( +https://developer.android.com/ndk/guides/neuralnetworks), it will +automatically try to use shared memory to store the `FlatBufferModel`. -Note that if TensorFlow Lite detects the presence of Android's NNAPI it will -automatically try to use shared memory to store the FlatBufferModel. +Now that you have the model as a `FlatBufferModel` object, you can execute it +with an [`Interpreter`]( +https://www.tensorflow.org/lite/api_docs/cc/class/tflite/interpreter.html). +A single `FlatBufferModel` can be used +simultaneously by more than one `Interpreter`. -#### Java +Caution: The `FlatBufferModel` object must remain valid until +all instances of `Interpreter` using it have been destroyed. -TensorFlow Lite's Java API supports on-device inference and is provided as an -Android Studio Library that allows loading models, feeding inputs, and -retrieving inference outputs. - -The `Interpreter` class drives model inference with TensorFlow Lite. In -most of the cases, this is the only class an app developer will need. - -The `Interpreter` can be initialized with a model file using the constructor: - -```java -public Interpreter(@NotNull File modelFile); -``` - -or with a `MappedByteBuffer`: - -```java -public Interpreter(@NotNull MappedByteBuffer mappedByteBuffer); -``` - -In both cases a valid TensorFlow Lite model must be provided or an -`IllegalArgumentException` with be thrown. 
If a `MappedByteBuffer` is used to -initialize an Interpreter, it should remain unchanged for the whole lifetime of -the `Interpreter`. - -### Running a Model {#running_a_model} - -#### C++ -Running a model involves a few simple steps: - - * Build an `Interpreter` based on an existing `FlatBufferModel` - * Optionally resize input tensors if the predefined sizes are not desired. - * Set input tensor values - * Invoke inference - * Read output tensor values - -The important parts of public interface of the `Interpreter` are provided -below. It should be noted that: +The important parts of the `Interpreter` API are shown in the +code snippet below. It should be noted that: * Tensors are represented by integers, in order to avoid string comparisons (and any fixed dependency on string libraries). * An interpreter must not be accessed from concurrent threads. * Memory allocation for input and output tensors must be triggered - by calling AllocateTensors() right after resizing tensors. + by calling `AllocateTensors()` right after resizing tensors. -In order to run the inference model in TensorFlow Lite, one has to load the -model into a `FlatBufferModel` object which then can be executed by an -`Interpreter`. The `FlatBufferModel` needs to remain valid for the whole -lifetime of the `Interpreter`, and a single `FlatBufferModel` can be -simultaneously used by more than one `Interpreter`. In concrete terms, the -`FlatBufferModel` object must be created before any `Interpreter` objects that -use it, and must be kept around until they have all been destroyed. - -The simplest usage of TensorFlow Lite will look like this: +The simplest usage of TensorFlow Lite with C++ looks like this: ```c++ -tflite::FlatBufferModel model(path_to_model); +// Load the model +std::unique_ptr model = + tflite::FlatBufferModel::BuildFromFile(filename); +// Build the interpreter tflite::ops::builtin::BuiltinOpResolver resolver; std::unique_ptr interpreter; tflite::InterpreterBuilder(*model, resolver)(&interpreter); @@ -185,9 +162,40 @@ interpreter->Invoke(); float* output = interpreter->typed_output_tensor(0); ``` -#### Java +For more example code, see [`minimal.cc`]( +https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/examples/minimal/minimal.cc) +and [`label_image.cc`]( +https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/examples/label_image/label_image.cc). -The simplest usage of Tensorflow Lite Java API looks like this: + +## Load and run a model in Java + +The Java API for running an inference with TensorFlow Lite is primarily designed +for use with Android, so it's available as an Android library dependency: +`org.tensorflow:tensorflow-lite`. + +In Java, you'll use the `Interpreter` class to load a model and drive model +inference. In many cases, this may be the only API you need. + +You can initialize an `Interpreter` using a `.tflite` file: + +```java +public Interpreter(@NotNull File modelFile); +``` + +Or with a `MappedByteBuffer`: + +```java +public Interpreter(@NotNull MappedByteBuffer mappedByteBuffer); +``` + +In both cases, you must provide a valid TensorFlow Lite model or the API throws +`IllegalArgumentException`. If you use `MappedByteBuffer` to +initialize an `Interpreter`, it must remain unchanged for the whole lifetime +of the `Interpreter`. + +To then run an inference with the model, simply call `Interpreter.run()`. 
+For example: ```java try (Interpreter interpreter = new Interpreter(file_of_a_tensorflowlite_model)) { @@ -195,48 +203,44 @@ try (Interpreter interpreter = new Interpreter(file_of_a_tensorflowlite_model)) } ``` -If a model takes only one input and returns only one output, the following will -trigger an inference run: - -```java -interpreter.run(input, output); -``` - -For models with multiple inputs, or multiple outputs, use: +The `run()` method takes only one input and returns only one output. So if your +model has multiple inputs or multiple outputs, instead use: ```java interpreter.runForMultipleInputsOutputs(inputs, map_of_indices_to_outputs); ``` -where each entry in `inputs` corresponds to an input tensor and +In this case, each entry in `inputs` corresponds to an input tensor and `map_of_indices_to_outputs` maps indices of output tensors to the corresponding -output data. In both cases the tensor indices should correspond to the values -given to the -[TensorFlow Lite Optimized Converter](../convert/cmdline_examples.md) when the -model was created. Be aware that the order of tensors in `input` must match the -order given to the `TensorFlow Lite Optimized Converter`. +output data. -The Java API also provides convenient functions for app developers to get the -index of any model input or output using a tensor name: +In both cases, the tensor indices should correspond to the values you gave to +the [TensorFlow Lite Converter](../convert/) when you created the model. +Be aware that the order of tensors in `input` must match the +order given to the TensorFlow Lite Converter. + +The `Interpreter` class also provides convenient functions for you to get the +index of any model input or output using an operation name: ```java -public int getInputIndex(String tensorName); -public int getOutputIndex(String tensorName); +public int getInputIndex(String opName); +public int getOutputIndex(String opName); ``` -If tensorName is not a valid name in model, an `IllegalArgumentException` will -be thrown. +If `opName` is not a valid operation in the model, it throws an +`IllegalArgumentException`. -##### Releasing Resources After Use - -An `Interpreter` owns resources. To avoid memory leak, the resources must be -released after use by: +Also beware that `Interpreter` owns resources. To avoid memory leak, the +resources must be released after use by: ```java interpreter.close(); ``` -##### Supported Data Types +For an example project with Java, see the [Android image classification sample]( +https://github.com/tensorflow/examples/tree/master/lite/examples/image_classification/android). + +### Supported data types (in Java) To use TensorFlow Lite, the data types of the input and output tensors must be one of the following primitive types: @@ -256,7 +260,7 @@ provided as a single, flat `ByteBuffer` argument. If other data types, including boxed types like `Integer` and `Float`, are used, an `IllegalArgumentException` will be thrown. -##### Inputs +#### Inputs Each input should be an array or multi-dimensional array of the supported primitive types, or a raw `ByteBuffer` of the appropriate size. If the input is @@ -265,12 +269,12 @@ implicitly resized to the array's dimensions at inference time. If the input is a ByteBuffer, the caller should first manually resize the associated input tensor (via `Interpreter.resizeInput()`) before running inference. 
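As a minimal sketch of that flow (the `model.tflite` path, the tensor index `0`, and the 1x4 float shapes are illustrative assumptions, not values taken from a real model):

```java
import java.io.File;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import org.tensorflow.lite.Interpreter;

try (Interpreter interpreter = new Interpreter(new File("model.tflite"))) {
  // Resize input tensor 0 to the shape we are about to feed.
  interpreter.resizeInput(0, new int[] {1, 4});

  // Fill a direct ByteBuffer (4 floats * 4 bytes) in native byte order.
  ByteBuffer input = ByteBuffer.allocateDirect(4 * 4).order(ByteOrder.nativeOrder());
  for (float v : new float[] {0.1f, 0.2f, 0.3f, 0.4f}) {
    input.putFloat(v);
  }

  // Run inference; the output array shape must match the model's output tensor.
  float[][] output = new float[1][4];
  interpreter.run(input, output);
}
```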
-When using 'ByteBuffer', prefer using direct byte buffers, as this allows the +When using `ByteBuffer`, prefer using direct byte buffers, as this allows the `Interpreter` to avoid unnecessary copies. If the `ByteBuffer` is a direct byte buffer, its order must be `ByteOrder.nativeOrder()`. After it is used for a model inference, it must remain unchanged until the model inference is finished. -##### Outputs +#### Outputs Each output should be an array or multi-dimensional array of the supported primitive types, or a ByteBuffer of the appropriate size. Note that some models @@ -279,7 +283,75 @@ the input. There's no straightforward way of handling this with the existing Java inference API, but planned extensions will make this possible. -## Writing Custom Operators +## Load and run a model in Python + +The Python API for running an inference is provided in the `tf.lite` +module. From which, you mostly need only [`tf.lite.Interpreter`]( +https://www.tensorflow.org/api_docs/python/tf/lite/Interpreter) to load +a model and run an inference. + +The following example shows how to use the Python interpreter to load a +`.tflite` file and run inference with random input data: + +```python +import numpy as np +import tensorflow as tf + +# Load TFLite model and allocate tensors. +interpreter = tf.lite.Interpreter(model_path="converted_model.tflite") +interpreter.allocate_tensors() + +# Get input and output tensors. +input_details = interpreter.get_input_details() +output_details = interpreter.get_output_details() + +# Test model on random input data. +input_shape = input_details[0]['shape'] +input_data = np.array(np.random.random_sample(input_shape), dtype=np.float32) +interpreter.set_tensor(input_details[0]['index'], input_data) + +interpreter.invoke() + +# The function `get_tensor()` returns a copy of the tensor data. +# Use `tensor()` in order to get a pointer to the tensor. +output_data = interpreter.get_tensor(output_details[0]['index']) +print(output_data) +``` + +Alternative to loading the model as a pre-converted `.tflite` file, you can +combine your code with the [TensorFlow Lite Converter Python API]( +../convert/python_api.md) (`tf.lite.TFLiteConverter`), allowing you to convert +your TensorFlow model into the TensorFlow Lite format and then run an inference: + +```python +import numpy as np +import tensorflow as tf + +img = tf.placeholder(name="img", dtype=tf.float32, shape=(1, 64, 64, 3)) +const = tf.constant([1., 2., 3.]) + tf.constant([1., 4., 4.]) +val = img + const +out = tf.identity(val, name="out") + +# Convert to TF Lite format +with tf.Session() as sess: + converter = tf.lite.TFLiteConverter.from_session(sess, [img], [out]) + tflite_model = converter.convert() + +# Load TFLite model and allocate tensors. +interpreter = tf.lite.Interpreter(model_content=tflite_model) +interpreter.allocate_tensors() + +# Continue to get tensors and so forth, as shown above... +``` + +For more Python sample code, see [`label_image.py`]( +https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/examples/python/label_image.py). + +Tip: Run `help(tf.lite.Interpreter)` in the Python terminal to get detailed +documentation about the interpreter. + + +## Write a custom operator All TensorFlow Lite operators (both custom and builtin) are defined using a simple pure-C interface that consists of four functions: @@ -343,7 +415,7 @@ Note that registration is not automatic and an explicit call to registration of builtins, custom ops will have to be collected in separate custom libraries. 
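As a rough sketch of that four-function interface (the op name and the no-op bodies are placeholders, not code copied from the TensorFlow Lite headers), a custom operator and its registration function typically look like:

```c++
// Minimal sketch of the init/free/prepare/invoke interface; bodies are placeholders.
namespace my_custom_op {

void* Init(TfLiteContext* context, const char* buffer, size_t length) {
  // Allocate any per-op state from the serialized options in `buffer`.
  return nullptr;
}

void Free(TfLiteContext* context, void* buffer) {
  // Release whatever Init allocated.
}

TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  // Check input shapes/types and resize output tensors here.
  return kTfLiteOk;
}

TfLiteStatus Invoke(TfLiteContext* context, TfLiteNode* node) {
  // Read the input tensors, compute, and write the output tensors.
  return kTfLiteOk;
}

}  // namespace my_custom_op

TfLiteRegistration* Register_MY_CUSTOM_OP() {
  static TfLiteRegistration registration = {
      my_custom_op::Init, my_custom_op::Free,
      my_custom_op::Prepare, my_custom_op::Invoke};
  return &registration;
}
```

The `Register_MY_CUSTOM_OP()` function is then made visible to the interpreter through the resolver registration call shown below.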
-### Customizing the kernel library +### Customize the kernel library Behind the scenes the interpreter will load a library of kernels which will be assigned to execute each of the operators in the model. While the default @@ -362,21 +434,19 @@ class OpResolver { }; ``` -Regular usage will require the developer to use the `BuiltinOpResolver` and -write: +Regular usage requires that you use the `BuiltinOpResolver` and write: ```c++ tflite::ops::builtin::BuiltinOpResolver resolver; ``` -They can then optionally register custom ops: +You can optionally register custom ops (before you pass the resolver to the +`InterpreterBuilder`): ```c++ resolver.AddOp("MY_CUSTOM_OP", Register_MY_CUSTOM_OP()); ``` -before the resolver is passed to the `InterpreterBuilder`. - If the set of builtin ops is deemed to be too large, a new `OpResolver` could be code-generated based on a given subset of ops, possibly only the ones contained in a given model. This is the equivalent of TensorFlow's selective From f001da35efe7769358345867196bea1e3f4badd6 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Wed, 24 Jul 2019 11:30:30 -0700 Subject: [PATCH 0478/3053] Simplify graphdef2mlir/graph-scalar-input.pbtxt test to be more targeted This test intends to check that we handle the command line flags for -tf-input/-tf-output, the CHECK lines are adjusted to reflect this. PiperOrigin-RevId: 259781337 --- .../graphdef2mlir/graph-scalar-input.pbtxt | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-scalar-input.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-scalar-input.pbtxt index daef0054fd6..01a8a11216d 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-scalar-input.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-scalar-input.pbtxt @@ -1,5 +1,14 @@ # RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-input-arrays=input -tf-input-data-types=DT_FLOAT -tf-input-shapes='' -tf-output-arrays=out:1,out -o - | FileCheck %s +# Verify that we match correctly the input / output when they are scalar. 
+ +# CHECK: func @main(%arg0: tensor) -> (tensor, tensor) +# CHECK-NEXT: attributes {tf.entry_function = {inputs = "input", outputs = "out"}} { +# CHECK: "_tf.Placeholder.input"(%arg0) + +# CHECK: %[[IDENTITY:[0-9]+]]:3 = "_tf.IdentityN" +# CHECK: return %[[IDENTITY]]#1, %[[IDENTITY]]#0 : tensor, tensor + node { name: "input" op: "Placeholder" @@ -52,11 +61,3 @@ node { versions { producer: 27 } - -# CHECK: func @main(%arg0: tensor) -> (tensor, tensor) -# CHECK-NEXT: attributes {tf.entry_function = {inputs = "input", outputs = "out"}} { -# CHECK-NEXT: %0:2 = "_tf.Placeholder.input"(%arg0) {device = "/device:CPU:0", dtype = "tfdtype$DT_FLOAT", name = "input", shape = "tfshape$"} : (tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Relu"(%0#0) {T = "tfdtype$DT_FLOAT", device = "/device:CPU:0", name = "Relu"} : (tensor) -> (tensor, !_tf.control) -# CHECK-NEXT: %2:3 = "_tf.IdentityN"(%1#0, %1#0) {T = ["tfdtype$DT_FLOAT", "tfdtype$DT_FLOAT"], device = "", name = "out"} : (tensor, tensor) -> (tensor, tensor, !_tf.control) -# CHECK-NEXT: return %2#1, %2#0 : tensor, tensor -# CHECK-NEXT: } From c16f5a89bd9c1413f84e7077080e65ccdbf3dfe1 Mon Sep 17 00:00:00 2001 From: "Xiaoming (Jason) Cui" Date: Tue, 23 Jul 2019 11:36:38 -0700 Subject: [PATCH 0479/3053] [INTEL_MKL] Updated the unit test analyzer_cli_test.py, so that it is compatible with recent changes in the graph rewrite logic for MKL-DNN support, in which the name and attributes of some ops have been changed, such as MatMul etc. --- .../python/debug/cli/analyzer_cli_test.py | 74 ++++++++++++------- 1 file changed, 48 insertions(+), 26 deletions(-) diff --git a/tensorflow/python/debug/cli/analyzer_cli_test.py b/tensorflow/python/debug/cli/analyzer_cli_test.py index 586982dc4bf..1ce8745b245 100644 --- a/tensorflow/python/debug/cli/analyzer_cli_test.py +++ b/tensorflow/python/debug/cli/analyzer_cli_test.py @@ -46,6 +46,9 @@ from tensorflow.python.platform import googletest from tensorflow.python.platform import test from tensorflow.python.util import tf_inspect +def matmul_prefix(): + prefix = "_Mkl" if test_util.IsMklEnabled() else "" + return prefix def _cli_config_from_temp_file(): return cli_config.CLIConfig( @@ -665,11 +668,16 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): # Use shorthand alias for the command prefix. out = self._registry.dispatch_command("lt", []) - assert_listed_tensors(self, out, [ - "simple_mul_add/u:0", "simple_mul_add/v:0", "simple_mul_add/u/read:0", - "simple_mul_add/v/read:0", "simple_mul_add/matmul:0", - "simple_mul_add/add:0" - ], ["VariableV2", "VariableV2", "Identity", "Identity", "MatMul", "Add"]) + assert_listed_tensors( + self, + out, [ + "simple_mul_add/u:0", "simple_mul_add/v:0", + "simple_mul_add/u/read:0", + "simple_mul_add/v/read:0", "simple_mul_add/matmul:0", + "simple_mul_add/add:0" + ], + ["VariableV2", "VariableV2", "Identity", "Identity", + matmul_prefix() + "MatMul", "Add"]) # Check the main menu. 
check_main_menu(self, out, list_tensors_enabled=False) @@ -684,7 +692,8 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): "simple_mul_add/u/read:0", "simple_mul_add/v/read:0", "simple_mul_add/matmul:0", "simple_mul_add/add:0" ], - ["VariableV2", "VariableV2", "Identity", "Identity", "MatMul", "Add"], + ["VariableV2", "VariableV2", "Identity", "Identity", + matmul_prefix() + "MatMul", "Add"], sort_by="timestamp", reverse=True) check_main_menu(self, out, list_tensors_enabled=False) @@ -698,7 +707,8 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): "simple_mul_add/u/read:0", "simple_mul_add/v/read:0", "simple_mul_add/matmul:0", "simple_mul_add/add:0" ], - ["VariableV2", "VariableV2", "Identity", "Identity", "MatMul", "Add"], + ["VariableV2", "VariableV2", "Identity", "Identity", + matmul_prefix() + "MatMul", "Add"], sort_by="dump_size") check_main_menu(self, out, list_tensors_enabled=False) @@ -711,7 +721,8 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): "simple_mul_add/u/read:0", "simple_mul_add/v/read:0", "simple_mul_add/matmul:0", "simple_mul_add/add:0" ], - ["VariableV2", "VariableV2", "Identity", "Identity", "MatMul", "Add"], + ["VariableV2", "VariableV2", "Identity", "Identity", + matmul_prefix() + "MatMul", "Add"], sort_by="dump_size", reverse=True) check_main_menu(self, out, list_tensors_enabled=False) @@ -731,7 +742,8 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): "simple_mul_add/u/read:0", "simple_mul_add/v/read:0", "simple_mul_add/matmul:0", "simple_mul_add/add:0" ], - ["VariableV2", "VariableV2", "Identity", "Identity", "MatMul", "Add"], + ["VariableV2", "VariableV2", "Identity", "Identity", + matmul_prefix() + "MatMul", "Add"], sort_by="op_type", reverse=False) check_main_menu(self, out, list_tensors_enabled=False) @@ -746,7 +758,8 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): "simple_mul_add/u/read:0", "simple_mul_add/v/read:0", "simple_mul_add/matmul:0", "simple_mul_add/add:0" ], - ["VariableV2", "VariableV2", "Identity", "Identity", "MatMul", "Add"], + ["VariableV2", "VariableV2", "Identity", "Identity", + matmul_prefix() + "MatMul", "Add"], sort_by="op_type", reverse=True) check_main_menu(self, out, list_tensors_enabled=False) @@ -761,7 +774,8 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): "simple_mul_add/u/read:0", "simple_mul_add/v/read:0", "simple_mul_add/matmul:0", "simple_mul_add/add:0" ], - ["VariableV2", "VariableV2", "Identity", "Identity", "MatMul", "Add"], + ["VariableV2", "VariableV2", "Identity", "Identity", + matmul_prefix() + "MatMul", "Add"], sort_by="tensor_name", reverse=False) check_main_menu(self, out, list_tensors_enabled=False) @@ -776,7 +790,8 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): "simple_mul_add/u/read:0", "simple_mul_add/v/read:0", "simple_mul_add/matmul:0", "simple_mul_add/add:0" ], - ["VariableV2", "VariableV2", "Identity", "Identity", "MatMul", "Add"], + ["VariableV2", "VariableV2", "Identity", "Identity", + matmul_prefix() + "MatMul", "Add"], sort_by="tensor_name", reverse=True) check_main_menu(self, out, list_tensors_enabled=False) @@ -803,13 +818,13 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): ["Identity", "Identity"], op_type_regex="Identity") - out = self._registry.dispatch_command("list_tensors", - ["-t", "(Add|MatMul)"]) + out = self._registry.dispatch_command( + "list_tensors", ["-t", "(Add|" + matmul_prefix() + "MatMul)"]) assert_listed_tensors( self, out, 
["simple_mul_add/add:0", "simple_mul_add/matmul:0"], - ["Add", "MatMul"], - op_type_regex="(Add|MatMul)") + ["Add", matmul_prefix() + "MatMul"], + op_type_regex=("(Add|" + matmul_prefix() + "MatMul)")) check_main_menu(self, out, list_tensors_enabled=False) def testListTensorFilterByNodeNameRegexAndOpTypeRegex(self): @@ -845,7 +860,8 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): assert_listed_tensors( self, out, ["simple_mul_add/matmul:0", "simple_mul_add/add:0"], - ["MatMul", "Add"], tensor_filter_name="is_2x1_vector") + [matmul_prefix() + "MatMul", "Add"], tensor_filter_name="is_2x1_vector") + check_main_menu(self, out, list_tensors_enabled=False) def testListTensorsFilterNanOrInf(self): @@ -884,7 +900,8 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): recipients = [("Add", "simple_mul_add/add"), ("Add", "simple_mul_add/add")] - assert_node_attribute_lines(self, out, node_name, "MatMul", + assert_node_attribute_lines(self, out, node_name, + matmul_prefix() + "MatMul", self._main_device, [("Identity", "simple_mul_add/u/read"), ("Identity", "simple_mul_add/v/read")], [], @@ -906,17 +923,21 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): node_name = "simple_mul_add/matmul" out = self._registry.dispatch_command("node_info", ["-a", node_name]) + test_attr_key_val_pairs = [("transpose_a", "b: false"), + ("transpose_b", "b: false"), + ("T", "type: DT_DOUBLE")] + if test_util.IsMklEnabled(): + test_attr_key_val_pairs.append(("_kernel", 's: "MklNameChangeOp"')) + assert_node_attribute_lines( self, out, node_name, - "MatMul", + matmul_prefix() + "MatMul", self._main_device, [("Identity", "simple_mul_add/u/read"), ("Identity", "simple_mul_add/v/read")], [], [("Add", "simple_mul_add/add"), ("Add", "simple_mul_add/add")], [], - attr_key_val_pairs=[("transpose_a", "b: false"), - ("transpose_b", "b: false"), - ("T", "type: DT_DOUBLE")]) + attr_key_val_pairs=test_attr_key_val_pairs) check_main_menu( self, out, @@ -933,7 +954,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): self, out, node_name, - "MatMul", + matmul_prefix() + "MatMul", self._main_device, [("Identity", "simple_mul_add/u/read"), ("Identity", "simple_mul_add/v/read")], [], [("Add", "simple_mul_add/add"), ("Add", "simple_mul_add/add")], [], @@ -959,7 +980,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): self, out, node_name, - "MatMul", + matmul_prefix() + "MatMul", self._main_device, [("Identity", "simple_mul_add/u/read"), ("Identity", "simple_mul_add/v/read")], [], [("Add", "simple_mul_add/add"), ("Add", "simple_mul_add/add")], [], @@ -982,7 +1003,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): self, out, node_name, - "MatMul", + matmul_prefix() + "MatMul", self._main_device, [("Identity", "simple_mul_add/u/read"), ("Identity", "simple_mul_add/v/read")], [], [("Add", "simple_mul_add/add"), ("Add", "simple_mul_add/add")], [], @@ -1003,7 +1024,8 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): assert_node_attribute_lines(self, out, node_name, "Identity", self._main_device, [("VariableV2", "simple_mul_add/u")], [], - [("MatMul", "simple_mul_add/matmul")], []) + [(matmul_prefix() + "MatMul", + "simple_mul_add/matmul")], []) check_main_menu( self, out, From 5e5b01c91415b6460510264ad3408a7fe9d1c628 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 24 Jul 2019 11:31:40 -0700 Subject: [PATCH 0480/3053] Fixed formatting of ctc_loss_v2 docstring. 
PiperOrigin-RevId: 259781576 --- tensorflow/python/ops/ctc_ops.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/tensorflow/python/ops/ctc_ops.py b/tensorflow/python/ops/ctc_ops.py index 22a8c95431c..a1d75f61fa2 100644 --- a/tensorflow/python/ops/ctc_ops.py +++ b/tensorflow/python/ops/ctc_ops.py @@ -615,18 +615,16 @@ def ctc_loss_v2(labels, pp. 369-376.](http://www.cs.toronto.edu/~graves/icml_2006.pdf) Notes: - - Same as the "Classic CTC" in TensorFlow 1.x's tf.compat.v1.nn.ctc_loss - setting of - preprocess_collapse_repeated=False, ctc_merge_repeated=True - - Labels may be supplied as either a dense, zero-padded tensor with a - vector of label sequence lengths OR as a SparseTensor. - - On TPU and GPU: - - Only dense padded labels are supported. - - On CPU: - - Caller may use SparseTensor or dense padded labels but calling with - a SparseTensor will be significantly faster. - - Default blank label is 0 rather num_classes - 1, unless overridden by - blank_index. + + - Same as the "Classic CTC" in TensorFlow 1.x's tf.compat.v1.nn.ctc_loss + setting of preprocess_collapse_repeated=False, ctc_merge_repeated=True + - Labels may be supplied as either a dense, zero-padded tensor with a + vector of label sequence lengths OR as a SparseTensor. + - On TPU and GPU: Only dense padded labels are supported. + - On CPU: Caller may use SparseTensor or dense padded labels but calling with + a SparseTensor will be significantly faster. + - Default blank label is 0 rather num_classes - 1, unless overridden by + blank_index. Args: labels: tensor of shape [batch_size, max_label_seq_length] or SparseTensor From 7251a1efe4eec620414765a098f1bacbee97dd30 Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Wed, 24 Jul 2019 11:34:50 -0700 Subject: [PATCH 0481/3053] Fix partial batch issue for the numpy data in training_v2. 1. Update the data adapter to include the final partial batch information if it is known. 2. Update training_v2 to aggregate based on number of example rather than steps when there is a known partial batch. The callback/progress bar will also use that in a followup cl. PiperOrigin-RevId: 259782295 --- .../python/keras/engine/data_adapter.py | 25 +++++++++++++ .../python/keras/engine/data_adapter_test.py | 5 +++ tensorflow/python/keras/engine/training_v2.py | 37 +++++++++++++++---- 3 files changed, 60 insertions(+), 7 deletions(-) diff --git a/tensorflow/python/keras/engine/data_adapter.py b/tensorflow/python/keras/engine/data_adapter.py index 87815772bd9..bd29560dfbe 100644 --- a/tensorflow/python/keras/engine/data_adapter.py +++ b/tensorflow/python/keras/engine/data_adapter.py @@ -152,6 +152,14 @@ class DataAdapter(object): """Whether the dataset has partial batch at the end.""" raise NotImplementedError + @abc.abstractmethod + def partial_batch_size(self): + """The size of the final partial batch for dataset. + + Will return None if has_partial_batch is False or batch_size is None. + """ + raise NotImplementedError + class TensorLikeDataAdapter(DataAdapter): """Adapter that handles Tensor-like objects, e.g. 
EagerTensor and NumPy.""" @@ -196,6 +204,11 @@ class TensorLikeDataAdapter(DataAdapter): self._size = 1 self._batch_size = num_samples self._has_partial_batch = False + self._partial_batch_size = None + if self._has_partial_batch: + self._partial_batch_size = ( + num_samples - (self._size - 1) * self._batch_size) + self._dataset = dataset def get_dataset(self): @@ -210,6 +223,9 @@ class TensorLikeDataAdapter(DataAdapter): def has_partial_batch(self): return self._has_partial_batch + def partial_batch_size(self): + return self._partial_batch_size + class DatasetAdapter(DataAdapter): """Adapter that handles `tf.data.Dataset`.""" @@ -243,6 +259,9 @@ class DatasetAdapter(DataAdapter): def has_partial_batch(self): return False + def partial_batch_size(self): + return None + class GeneratorDataAdapter(DataAdapter): """Adapter that handles python generator.""" @@ -288,6 +307,9 @@ class GeneratorDataAdapter(DataAdapter): def has_partial_batch(self): return False + def partial_batch_size(self): + return None + class KerasSequenceAdapter(DataAdapter): """Adapter that handles `keras.utils.Sequence`.""" @@ -331,6 +353,9 @@ class KerasSequenceAdapter(DataAdapter): def has_partial_batch(self): return False + def partial_batch_size(self): + return None + ALL_ADAPTER_CLS = [ TensorLikeDataAdapter, DatasetAdapter, GeneratorDataAdapter, diff --git a/tensorflow/python/keras/engine/data_adapter_test.py b/tensorflow/python/keras/engine/data_adapter_test.py index 97bd4b018a9..5564e6c02f9 100644 --- a/tensorflow/python/keras/engine/data_adapter_test.py +++ b/tensorflow/python/keras/engine/data_adapter_test.py @@ -102,6 +102,7 @@ class TensorLikeDataAdapterTest(DataAdapterTestBase): self.numpy_input, self.numpy_target, batch_size=4) self.assertEqual(adapter.get_size(), 13) # 50/4 self.assertTrue(adapter.has_partial_batch()) + self.assertEqual(adapter.partial_batch_size(), 2) def test_training_numpy(self): dataset = self.adapter_cls( @@ -140,6 +141,7 @@ class TensorLikeDataAdapterTest(DataAdapterTestBase): self.tensor_input, self.tensor_target, batch_size=4) self.assertEqual(adapter.get_size(), 13) # 50/4 self.assertTrue(adapter.has_partial_batch()) + self.assertEqual(adapter.partial_batch_size(), 2) class DatasetAdapterTest(DataAdapterTestBase): @@ -171,6 +173,7 @@ class DatasetAdapterTest(DataAdapterTestBase): def test_partial_batch(self): adapter = self.adapter_cls(self.dataset_input) self.assertFalse(adapter.has_partial_batch()) + self.assertIsNone(adapter.partial_batch_size()) class GeneratorDataAdapterTest(DataAdapterTestBase): @@ -202,6 +205,7 @@ class GeneratorDataAdapterTest(DataAdapterTestBase): def test_partial_batch(self): adapter = self.adapter_cls(self.generator_input) self.assertFalse(adapter.has_partial_batch()) + self.assertIsNone(adapter.partial_batch_size()) class KerasSequenceAdapterTest(DataAdapterTestBase): @@ -233,6 +237,7 @@ class KerasSequenceAdapterTest(DataAdapterTestBase): def test_partial_batch(self): adapter = self.adapter_cls(self.sequence_input) self.assertFalse(adapter.has_partial_batch()) + self.assertIsNone(adapter.partial_batch_size()) if __name__ == '__main__': diff --git a/tensorflow/python/keras/engine/training_v2.py b/tensorflow/python/keras/engine/training_v2.py index 6e789ccd73c..7e89312d891 100644 --- a/tensorflow/python/keras/engine/training_v2.py +++ b/tensorflow/python/keras/engine/training_v2.py @@ -61,7 +61,8 @@ def run_one_epoch(model, steps_per_epoch=None, mode=ModeKeys.TRAIN, training_context=None, - total_epochs=None): + total_epochs=None, + 
partical_batch_size=None): """Run the execution function with the data from iterator. Given the dataset iterator and execution function, get the data from iterator @@ -81,15 +82,26 @@ def run_one_epoch(model, total_epochs: the total number of epochs that will be run. Used when throw error when the iterator unexpectedly reaches its end. + partical_batch_size: the size of the final batch if it is already known. It + will be used to scale the loss value for the final batch. Returns: The loss and metric value from the model. """ + # Only use the sample to count if there is a partial batch at the end. + use_steps = not (partical_batch_size and batch_size and steps_per_epoch and + steps_per_epoch == dataset_size) + num_samples = None if use_steps else batch_size * (steps_per_epoch - + 1) + partical_batch_size + if mode == ModeKeys.PREDICT: aggregator = training_utils.OutputsAggregator( - use_steps=True, steps=steps_per_epoch, batch_size=batch_size) + use_steps=use_steps, + steps=steps_per_epoch, + num_samples=num_samples, + batch_size=batch_size) else: aggregator = training_utils.MetricsAggregator( - use_steps=True, steps=steps_per_epoch) + use_steps=use_steps, steps=steps_per_epoch, num_samples=num_samples) callbacks = training_context.callbacks progbar = training_context.progbar @@ -143,7 +155,14 @@ def run_one_epoch(model, if step == 0: aggregator.create(batch_outs) - aggregator.aggregate(batch_outs) + + if use_steps: + aggregator.aggregate(batch_outs) + else: + aggregator.aggregate( + batch_outs, + batch_start=step * batch_size, + batch_end=min((step + 1) * batch_size, num_samples)) cbks.make_logs(model, batch_logs, batch_outs, mode) training_context.callbacks._call_batch_hook( @@ -286,7 +305,8 @@ class Loop(training_utils.TrainingLoop): steps_per_epoch=steps_per_epoch, mode=ModeKeys.TRAIN, training_context=training_context, - total_epochs=epochs) + total_epochs=epochs, + partical_batch_size=training_data_adapter.partial_batch_size()) cbks.make_logs(model, epoch_logs, training_result, ModeKeys.TRAIN) # Evaluation @@ -316,7 +336,9 @@ class Loop(training_utils.TrainingLoop): steps_per_epoch=validation_steps, mode=ModeKeys.TEST, training_context=eval_context, - total_epochs=1) + total_epochs=1, + partical_batch_size=validation_adapter.partial_batch_size( + )) cbks.make_logs(model, epoch_logs, eval_result, ModeKeys.TEST, prefix='val_') @@ -389,7 +411,8 @@ class Loop(training_utils.TrainingLoop): steps_per_epoch=steps, mode=mode, training_context=training_context, - total_epochs=1) + total_epochs=1, + partical_batch_size=adapter.partial_batch_size()) cbks.make_logs(model, epoch_logs, result, mode) if len(result) == 1: From d0eeef269d1f18961c6cd0a8d80ede564626fbc3 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 24 Jul 2019 11:36:35 -0700 Subject: [PATCH 0482/3053] Use tf.function's default autograph=True in saved_model/integration_test. There is currently no demonstrable need to do something non-obvious here. 
PiperOrigin-RevId: 259782705 --- .../examples/saved_model/integration_tests/export_mnist_cnn.py | 2 +- .../examples/saved_model/integration_tests/export_rnn_cell.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/examples/saved_model/integration_tests/export_mnist_cnn.py b/tensorflow/examples/saved_model/integration_tests/export_mnist_cnn.py index 74981b5fbf7..6b94fda0f34 100644 --- a/tensorflow/examples/saved_model/integration_tests/export_mnist_cnn.py +++ b/tensorflow/examples/saved_model/integration_tests/export_mnist_cnn.py @@ -117,7 +117,7 @@ def wrap_keras_model_for_export(model, batch_input_shape, # the desired argspec. def wrapped(*args, **kwargs): # TODO(arnoegw): Can we use call_fn itself? return call_fn(*args, **kwargs) - traced_call_fn = tf.function(autograph=False)( + traced_call_fn = tf.function( tf_decorator.make_decorator(call_fn, wrapped, decorator_argspec=argspec)) # Now we need to trigger traces for all supported combinations of the diff --git a/tensorflow/examples/saved_model/integration_tests/export_rnn_cell.py b/tensorflow/examples/saved_model/integration_tests/export_rnn_cell.py index 876e3004bca..6a2853f0617 100644 --- a/tensorflow/examples/saved_model/integration_tests/export_rnn_cell.py +++ b/tensorflow/examples/saved_model/integration_tests/export_rnn_cell.py @@ -37,7 +37,7 @@ def main(argv): root.rnn_cell = tf.keras.layers.LSTMCell(units=10, recurrent_initializer=None) # Wrap the rnn_cell.__call__ function and assign to next_state. - root.next_state = tf.function(root.rnn_cell.__call__, autograph=False) + root.next_state = tf.function(root.rnn_cell.__call__) # Wrap the rnn_cell.get_initial_function using a decorator and assign to an # attribute with the same name. From af27231dc3a44643aad57374ead201a06b3d72a6 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 24 Jul 2019 11:36:41 -0700 Subject: [PATCH 0483/3053] TraceUsingAnnotations is not required, ScopedAnnotation are enabled in device-sepcific tracer. therefore no need to check device before check ScopedAnnotation::IsEnabled(). Also calling virtual function is slower than atomic::load. proof attached. PiperOrigin-RevId: 259782723 --- tensorflow/core/common_runtime/device.h | 6 --- .../common_runtime/eager/kernel_and_device.cc | 38 ++++++------------- tensorflow/core/common_runtime/executor.cc | 32 +++++----------- .../core/common_runtime/gpu/gpu_device.h | 5 --- 4 files changed, 22 insertions(+), 59 deletions(-) diff --git a/tensorflow/core/common_runtime/device.h b/tensorflow/core/common_runtime/device.h index e25bd06c17e..c8db4a03f91 100644 --- a/tensorflow/core/common_runtime/device.h +++ b/tensorflow/core/common_runtime/device.h @@ -103,12 +103,6 @@ class Device : public DeviceBase { } } - // If true, and tracing is enabled, the `tracing::ScopedAnnotation()` tracing - // mechanism will be used instead of `tracing::ScopedActivity()`. Some devices - // may override this method to use annotations, which enable child activities - // (such as GPU kernel launches) to be related to the OpKernel invocation. - virtual bool TraceUsingAnnotations() const { return false; } - // Blocks until all operations queued on the device at the time of // the call have completed. Returns any error pending on the device // at completion. 
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.cc b/tensorflow/core/common_runtime/eager/kernel_and_device.cc index 3492ddf7781..07c7ef28af0 100644 --- a/tensorflow/core/common_runtime/eager/kernel_and_device.cc +++ b/tensorflow/core/common_runtime/eager/kernel_and_device.cc @@ -313,32 +313,18 @@ Status KernelAndDeviceOp::Run(ScopedStepContainer* step_container, done.WaitForNotification(); } else { const string& op_name = kernel_->name(); - // If tracing if off, the overheads of ScopedAnnotation and TraceMe - // are negligible. - if (device_->TraceUsingAnnotations()) { - // 'ScopedActivity' will trace the OpKernel scheduling time on host. - profiler::TraceMe activity( - [&] { - return absl::StrCat(op_name, ":", kernel_->type_string(), "#id=", - step_container ? step_container->step_id() : 0, - ",device=", device_->name(), ",async=false#"); - }, - profiler::TraceMeLevel::kInfo); - // 'ScopedAnnotation' will trace the OpKernel execution time on device. - tracing::ScopedAnnotation annotation( - [&]() { return absl::StrCat(op_name, ":", kernel_->type_string()); }); - device_->Compute(kernel_.get(), &context); - } else { - profiler::TraceMe activity( - [&] { - return strings::StrCat( - op_name, ":", kernel_->type_string(), - "#id=", step_container ? step_container->step_id() : 0, - ",device=", device_->name(), ",async=false#"); - }, - profiler::TraceMeLevel::kInfo); - device_->Compute(kernel_.get(), &context); - } + // 'ScopedActivity' will trace the OpKernel scheduling time on host. + profiler::TraceMe activity( + [&] { + return absl::StrCat(op_name, ":", kernel_->type_string(), "#id=", + step_container ? step_container->step_id() : 0, + ",device=", device_->name(), ",async=false#"); + }, + profiler::TraceMeLevel::kInfo); + // 'ScopedAnnotation' will trace the OpKernel execution time on device. + tracing::ScopedAnnotation annotation( + [&]() { return absl::StrCat(op_name, ":", kernel_->type_string()); }); + device_->Compute(kernel_.get(), &context); } // Clean up execution op_execution_state if deferred ops aren't running. diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc index bc0609e04e2..0be4394b985 100644 --- a/tensorflow/core/common_runtime/executor.cc +++ b/tensorflow/core/common_runtime/executor.cc @@ -1272,7 +1272,6 @@ class ExecutorState { std::unique_ptr user_device_; Executor::Args::Runner runner_; bool sync_on_finish_; - const bool trace_using_annotations_; // Owned. @@ -1405,7 +1404,6 @@ ExecutorState::ExecutorState(const Executor::Args& args, ExecutorImpl* impl) cancellation_manager_(args.cancellation_manager), runner_(args.runner), sync_on_finish_(args.sync_on_finish), - trace_using_annotations_(impl->params_.device->TraceUsingAnnotations()), num_outstanding_ops_(0) { if (args.user_intra_op_threadpool != nullptr) { Device* device = impl_->params_.device; @@ -1600,8 +1598,7 @@ struct ExecutorState::AsyncState { // Returns true if `item` might be traced by the given trace and event // collectors. Returns false only if `item` definitely will not be traced. bool MightTrace(const NodeItem& item, - const tracing::EventCollector* event_collector, - bool using_annotations) { + const tracing::EventCollector* event_collector) { // Tracing will only be enabled if either `event_collector` is non null, // or `trace_collector` is non-null and enabled for this particular kernel. 
// Although `profiler::TraceMe`, `tracing::ScopedAnnotation`, and @@ -1613,7 +1610,7 @@ bool MightTrace(const NodeItem& item, return true; } - if (using_annotations && tracing::ScopedAnnotation::IsEnabled()) return true; + if (tracing::ScopedAnnotation::IsEnabled()) return true; return profiler::TraceMeRecorder::Active( profiler::GetTFTraceMeLevel(item.kernel->IsExpensive())); @@ -1829,8 +1826,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) { OpKernelContext ctx(¶ms, item.num_outputs); nodestats::SetOpStart(stats); - if (TF_PREDICT_FALSE( - MightTrace(item, event_collector_, trace_using_annotations_))) { + if (TF_PREDICT_FALSE(MightTrace(item, event_collector_))) { const string& op_name = op_kernel->name(); const string kernel_label = strings::StrCat( op_name, ":", op_kernel->type_string(), @@ -1838,21 +1834,13 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) { ",device=", device->name(), ",async=false#"); tracing::ScopedRegion region(tracing::EventCategory::kCompute, op_name); - if (trace_using_annotations_) { - // 'TraceMe' will trace the OpKernel scheduling time. - profiler::TraceMe activity(absl::string_view(kernel_label), - profiler::TraceMeLevel::kInfo); - // 'ScopedAnnotation' will trace the OpKernel execution time. - tracing::ScopedAnnotation annotation(kernel_label); - device->Compute(op_kernel, &ctx); - } else { - // Use the cheaper `TraceMe` to trace just the OpKernel - // execution. - profiler::TraceMe activity( - absl::string_view(kernel_label), - profiler::GetTFTraceMeLevel(op_kernel->IsExpensive())); - device->Compute(op_kernel, &ctx); - } + // 'TraceMe' will trace the OpKernel scheduling time. + profiler::TraceMe activity( + absl::string_view(kernel_label), + profiler::GetTFTraceMeLevel(op_kernel->IsExpensive())); + // 'ScopedAnnotation' will trace the OpKernel execution time. + tracing::ScopedAnnotation annotation(kernel_label); + device->Compute(op_kernel, &ctx); } else { // In the common case, avoid creating any tracing objects. if (op_kernel->IsExpensive()) { diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h index 2dc775c337a..cbba89d0d05 100644 --- a/tensorflow/core/common_runtime/gpu/gpu_device.h +++ b/tensorflow/core/common_runtime/gpu/gpu_device.h @@ -67,11 +67,6 @@ class BaseGPUDevice : public LocalDevice { // completes. bool RequiresRecordingAccessedTensors() const override; - // GPU kernel execution requires us to use `tracing::ScopedAnnotation()` - // rather than `tracing::ScopedActivity()`, in order to relate asynchronously - // launched GPU kernels to the OpKernel. - bool TraceUsingAnnotations() const { return true; } - void ConsumeListOfAccessedTensors( DeviceContext* device_context, const TensorReferenceVector& tensor_refs) override; From a5548b54eeb8270a05cfca2da3816f2e56853509 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 24 Jul 2019 11:41:54 -0700 Subject: [PATCH 0484/3053] Add option for controlling import logic in API generator. 
PiperOrigin-RevId: 259783795 --- .../python/tools/api/generator/api_gen.bzl | 4 +- .../tools/api/generator/create_python_api.py | 112 +++++++++++------- 2 files changed, 75 insertions(+), 41 deletions(-) diff --git a/tensorflow/python/tools/api/generator/api_gen.bzl b/tensorflow/python/tools/api/generator/api_gen.bzl index 234addaf782..71610d3574b 100644 --- a/tensorflow/python/tools/api/generator/api_gen.bzl +++ b/tensorflow/python/tools/api/generator/api_gen.bzl @@ -92,6 +92,8 @@ def gen_api_init_files( " --compat_init_template=$(location %s)" % compat_init_template ) + loading_flag = " --loading=default" + native.genrule( name = name, outs = all_output_files, @@ -100,7 +102,7 @@ def gen_api_init_files( root_init_template_flag + " --apidir=$(@D)" + output_dir + " --apiname=" + api_name + " --apiversion=" + str(api_version) + compat_api_version_flags + " " + compat_init_template_flags + - " --package=" + ",".join(packages) + + loading_flag + " --package=" + ",".join(packages) + " --output_package=" + output_package + " $(OUTS)" ), srcs = srcs, diff --git a/tensorflow/python/tools/api/generator/create_python_api.py b/tensorflow/python/tools/api/generator/create_python_api.py index a8a1c760637..98cd159a63f 100644 --- a/tensorflow/python/tools/api/generator/create_python_api.py +++ b/tensorflow/python/tools/api/generator/create_python_api.py @@ -75,34 +75,6 @@ class SymbolExposedTwiceError(Exception): pass -def format_import(source_module_name, source_name, dest_name): - """Formats import statement. - - Args: - source_module_name: (string) Source module to import from. - source_name: (string) Source symbol name to import. - dest_name: (string) Destination alias name. - - Returns: - An import statement string. - """ - if _LAZY_LOADING: - return " '%s': ('%s', '%s')," % (dest_name, source_module_name, - source_name) - else: - if source_module_name: - if source_name == dest_name: - return 'from %s import %s' % (source_module_name, source_name) - else: - return 'from %s import %s as %s' % (source_module_name, source_name, - dest_name) - else: - if source_name == dest_name: - return 'import %s' % source_name - else: - return 'import %s as %s' % (source_name, dest_name) - - def get_canonical_import(import_set): """Obtain one single import from a set of possible sources of a symbol. @@ -133,7 +105,7 @@ def get_canonical_import(import_set): class _ModuleInitCodeBuilder(object): """Builds a map from module name to imports included in that module.""" - def __init__(self, output_package, api_version): + def __init__(self, output_package, api_version, lazy_loading=_LAZY_LOADING): self._output_package = output_package # Maps API module to API symbol name to set of tuples of the form # (module name, priority). @@ -145,6 +117,9 @@ class _ModuleInitCodeBuilder(object): # Names that start with underscore in the root module. self._underscore_names_in_root = [] self._api_version = api_version + # Controls whether or not exported symbols are lazily loaded or statically + # imported. + self._lazy_loading = lazy_loading def _check_already_imported(self, symbol_id, api_name): if (api_name in self._dest_import_to_id and @@ -171,7 +146,7 @@ class _ModuleInitCodeBuilder(object): SymbolExposedTwiceError: Raised when an import with the same dest_name has already been added to dest_module_name. """ - import_str = format_import(source_module_name, source_name, dest_name) + import_str = self.format_import(source_module_name, source_name, dest_name) # Check if we are trying to expose two different symbols with same name. 
full_api_name = dest_name @@ -211,7 +186,7 @@ class _ModuleInitCodeBuilder(object): submodule = module_split[submodule_index-1] parent_module += '.' + submodule if parent_module else submodule import_from = self._output_package - if _LAZY_LOADING: + if self._lazy_loading: import_from += '.' + '.'.join(module_split[:submodule_index + 1]) self.add_import( symbol=None, @@ -247,7 +222,7 @@ class _ModuleInitCodeBuilder(object): get_canonical_import(imports) for _, imports in dest_name_to_imports.items() ] - if _LAZY_LOADING: + if self._lazy_loading: module_text_map[ dest_module] = _LAZY_LOADING_MODULE_TEXT_TEMPLATE % '\n'.join( sorted(imports_list)) @@ -258,7 +233,7 @@ class _ModuleInitCodeBuilder(object): # from it using * import. Don't need this for lazy_loading because the # underscore symbols are already included in __all__ when passed in and # handled by TFModuleWrapper. - if not _LAZY_LOADING: + if not self._lazy_loading: underscore_names_str = ', '.join( '\'%s\'' % name for name in self._underscore_names_in_root) @@ -275,9 +250,10 @@ __all__.extend([_s for _s in _names_with_underscore]) if not dest_module.startswith(_COMPAT_MODULE_PREFIX): deprecation = 'True' # Workaround to make sure not load lite from lite/__init__.py - if not dest_module and 'lite' in self._module_imports and _LAZY_LOADING: + if (not dest_module and 'lite' in self._module_imports + and self._lazy_loading): has_lite = 'True' - if _LAZY_LOADING: + if self._lazy_loading: public_apis_name = '_PUBLIC_APIS' else: public_apis_name = 'None' @@ -286,6 +262,33 @@ __all__.extend([_s for _s in _names_with_underscore]) return module_text_map, footer_text_map + def format_import(self, source_module_name, source_name, dest_name): + """Formats import statement. + + Args: + source_module_name: (string) Source module to import from. + source_name: (string) Source symbol name to import. + dest_name: (string) Destination alias name. + + Returns: + An import statement string. + """ + if self._lazy_loading: + return " '%s': ('%s', '%s')," % (dest_name, source_module_name, + source_name) + else: + if source_module_name: + if source_name == dest_name: + return 'from %s import %s' % (source_module_name, source_name) + else: + return 'from %s import %s as %s' % (source_module_name, source_name, + dest_name) + else: + if source_name == dest_name: + return 'import %s' % source_name + else: + return 'import %s as %s' % (source_name, dest_name) + def _get_name_and_module(full_name): """Split full_name into module and short name. @@ -368,7 +371,8 @@ def get_api_init_text(packages, output_package, api_name, api_version, - compat_api_versions=None): + compat_api_versions=None, + lazy_loading=_LAZY_LOADING): """Get a map from destination module to __init__.py code for that module. Args: @@ -380,6 +384,8 @@ def get_api_init_text(packages, api_version: API version you want to generate (1 or 2). compat_api_versions: Additional API versions to generate under compat/ directory. + lazy_loading: Boolean flag. If True, a lazy loading `__init__.py` file is + produced and if `False`, static imports are used. Returns: A dictionary where @@ -389,7 +395,8 @@ def get_api_init_text(packages, """ if compat_api_versions is None: compat_api_versions = [] - module_code_builder = _ModuleInitCodeBuilder(output_package, api_version) + module_code_builder = _ModuleInitCodeBuilder( + output_package, api_version, lazy_loading) # Traverse over everything imported above. Specifically, # we want to traverse over TensorFlow Python modules. 
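To make the two modes concrete, `format_import` above emits either a lazy-loading map entry or a plain import statement. A small sketch (the `math_ops.add` symbol and the output package are chosen only for illustration, and this assumes the generator module is importable from a source checkout):

```python
from tensorflow.python.tools.api.generator.create_python_api import (
    _ModuleInitCodeBuilder)

static_builder = _ModuleInitCodeBuilder('tensorflow', api_version=2,
                                        lazy_loading=False)
lazy_builder = _ModuleInitCodeBuilder('tensorflow', api_version=2,
                                      lazy_loading=True)

# Static mode produces an ordinary import line for the generated __init__.py:
print(static_builder.format_import('tensorflow.python.ops.math_ops', 'add', 'add'))
# from tensorflow.python.ops.math_ops import add

# Lazy mode produces an entry for the _PUBLIC_APIS map consumed by TFModuleWrapper:
print(lazy_builder.format_import('tensorflow.python.ops.math_ops', 'add', 'add'))
#  'add': ('tensorflow.python.ops.math_ops', 'add'),
```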
@@ -491,7 +498,8 @@ def get_module_docstring(module_name, package, api_name): def create_api_files(output_files, packages, root_init_template, output_dir, output_package, api_name, api_version, - compat_api_versions, compat_init_templates): + compat_api_versions, compat_init_templates, + lazy_loading=_LAZY_LOADING): """Creates __init__.py files for the Python API. Args: @@ -509,6 +517,8 @@ def create_api_files(output_files, packages, root_init_template, output_dir, subdirectory. compat_init_templates: List of templates for top level compat init files in the same order as compat_api_versions. + lazy_loading: Boolean flag. If True, a lazy loading `__init__.py` file is + produced and if `False`, static imports are used. Raises: ValueError: if output_files list is missing a required file. @@ -526,7 +536,7 @@ def create_api_files(output_files, packages, root_init_template, output_dir, module_text_map, deprecation_footer_map = get_api_init_text( packages, output_package, api_name, - api_version, compat_api_versions) + api_version, compat_api_versions, lazy_loading) # Add imports to output files. missing_output_files = [] @@ -621,6 +631,14 @@ def main(): parser.add_argument( '--output_package', default='tensorflow', type=str, help='Root output package.') + parser.add_argument( + '--loading', default='default', type=str, + choices=['lazy', 'static', 'default'], + help='Controls how the generated __init__.py file loads the exported ' + 'symbols. \'lazy\' means the symbols are loaded when first used. ' + '\'static\' means all exported symbols are loaded in the ' + '__init__.py file. \'default\' uses the value of the ' + '_LAZY_LOADING constant in create_python_api.py.') args = parser.parse_args() if len(args.outputs) == 1: @@ -635,9 +653,23 @@ def main(): packages = args.packages.split(',') for package in packages: importlib.import_module(package) + + # Determine if the modules shall be loaded lazily or statically. + if args.loading == 'default': + lazy_loading = _LAZY_LOADING + elif args.loading == 'lazy': + lazy_loading = True + elif args.loading == 'static': + lazy_loading = False + else: + # This should never happen (tm). + raise ValueError('Invalid value for --loading flag: %s. Must be one of ' + 'lazy, static, default.' % args.loading) + create_api_files(outputs, packages, args.root_init_template, args.apidir, args.output_package, args.apiname, args.apiversion, - args.compat_apiversions, args.compat_init_templates) + args.compat_apiversions, args.compat_init_templates, + lazy_loading) if __name__ == '__main__': From ad779e54612501f53a68acb7482e4d7448f81e08 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 24 Jul 2019 11:51:29 -0700 Subject: [PATCH 0485/3053] Switch to a shorter representation for attributes containing protobuf strings Switch from serializing using DebugString() to ShortDebugString() when creating attributes. This avoids excess whitespace and results in shorter representation. 
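For a `TensorShapeProto` with a single dimension of size 10 (the same shape used in the updated tests below), the two text serializations differ roughly as follows (a sketch; the exact whitespace comes from the protobuf text printer):

```c++
#include "tensorflow/core/framework/tensor_shape.pb.h"

tensorflow::TensorShapeProto shape;
shape.add_dim()->set_size(10);

shape.DebugString();       // "dim {\n  size: 10\n}\n"  (multi-line, indented)
shape.ShortDebugString();  // "dim { size: 10 }"        (single line)
```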
PiperOrigin-RevId: 259785766 --- .../compiler/mlir/tensorflow/tests/graphdef2mlir/add.pbtxt | 4 ++-- .../tests/graphdef2mlir/graph-empty-tensor-content.pbtxt | 2 +- .../tensorflow/tests/graphdef2mlir/prune_unused_nodes.pbtxt | 6 +++--- .../mlir/tensorflow/tests/graphdef2mlir/string-attr.pbtxt | 2 +- tensorflow/compiler/mlir/tensorflow/utils/mangling_util.cc | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/add.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/add.pbtxt index ffbd84c7ee7..c9df1f2ec6c 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/add.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/add.pbtxt @@ -38,8 +38,8 @@ versions { # CHECK: func @main(%arg0: tensor<10xi32>, %arg1: tensor<10xi32>) -> tensor<10xi32> # CHECK: attributes {tf.entry_function = {inputs = "input0, input1", outputs = "Add"}} { -# CHECK: %0:2 = "_tf.Placeholder.input"(%arg0) {device = "", dtype = "tfdtype$DT_INT32", name = "input0", shape = "tfshape$dim {\0A size: 10\0A}\0A"} : (tensor<10xi32>) -> (tensor<10xi32>, !_tf.control) -# CHECK: %1:2 = "_tf.Placeholder.input"(%arg1) {device = "", dtype = "tfdtype$DT_INT32", name = "input1", shape = "tfshape$dim {\0A size: 10\0A}\0A"} : (tensor<10xi32>) -> (tensor<10xi32>, !_tf.control) +# CHECK: %0:2 = "_tf.Placeholder.input"(%arg0) {device = "", dtype = "tfdtype$DT_INT32", name = "input0", shape = "tfshape$dim { size: 10 }"} : (tensor<10xi32>) -> (tensor<10xi32>, !_tf.control) +# CHECK: %1:2 = "_tf.Placeholder.input"(%arg1) {device = "", dtype = "tfdtype$DT_INT32", name = "input1", shape = "tfshape$dim { size: 10 }"} : (tensor<10xi32>) -> (tensor<10xi32>, !_tf.control) # CHECK: %2:2 = "_tf.Add"(%0#0, %1#0) {T = "tfdtype$DT_INT32", device = "", name = "Add"} : (tensor<10xi32>, tensor<10xi32>) -> (tensor<10xi32>, !_tf.control) # CHECK: return %2#0 : tensor<10xi32> # CHECK: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-empty-tensor-content.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-empty-tensor-content.pbtxt index de56712ca13..c023c7e6658 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-empty-tensor-content.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-empty-tensor-content.pbtxt @@ -3,7 +3,7 @@ # This test is intended to verify the tensor_content field on import of an empty # tensor. 
# CHECK: tf.Const -# CHECK-SAME: value = opaque<"tf", "0x746674656E736F722464747970653A2044545F464C4F41540A74656E736F725F7368617065207B0A202064696D207B0A2020202073697A653A20310A20207D0A7D0A"> +# CHECK-SAME: value = opaque<"tf", "0x746674656E736F722464747970653A2044545F464C4F41542074656E736F725F7368617065207B2064696D207B2073697A653A2031207D207D"> node { name: "Const" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/prune_unused_nodes.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/prune_unused_nodes.pbtxt index a745cf302e9..f57a42ae287 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/prune_unused_nodes.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/prune_unused_nodes.pbtxt @@ -69,10 +69,10 @@ versions { # CHECK: func @main(%arg0: tensor<10xi32>, %arg1: tensor<10xi32>, %arg2: tensor<10xi32>) -> tensor<10xi32> # CHECK-NEXT: attributes {tf.entry_function = {inputs = "input0, input1, unused_input", outputs = "Add"}} { -# CHECK-NEXT: %0:2 = "_tf.Placeholder.input"(%arg0) {device = "", dtype = "tfdtype$DT_INT32", name = "input0", shape = "tfshape$dim {\0A size: 10\0A}\0A"} : (tensor<10xi32>) -> (tensor<10xi32>, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Placeholder.input"(%arg1) {device = "", dtype = "tfdtype$DT_INT32", name = "input1", shape = "tfshape$dim {\0A size: 10\0A}\0A"} : (tensor<10xi32>) -> (tensor<10xi32>, !_tf.control) +# CHECK-NEXT: %0:2 = "_tf.Placeholder.input"(%arg0) {device = "", dtype = "tfdtype$DT_INT32", name = "input0", shape = "tfshape$dim { size: 10 }"} : (tensor<10xi32>) -> (tensor<10xi32>, !_tf.control) +# CHECK-NEXT: %1:2 = "_tf.Placeholder.input"(%arg1) {device = "", dtype = "tfdtype$DT_INT32", name = "input1", shape = "tfshape$dim { size: 10 }"} : (tensor<10xi32>) -> (tensor<10xi32>, !_tf.control) # CHECK-NEXT: %2:2 = "_tf.Add"(%0#0, %1#0) {T = "tfdtype$DT_INT32", device = "", name = "Add"} : (tensor<10xi32>, tensor<10xi32>) -> (tensor<10xi32>, !_tf.control) -# CHECK-NEXT: %3:2 = "_tf.Placeholder.input"(%arg2) {device = "", dtype = "tfdtype$DT_INT32", name = "unused_input", shape = "tfshape$dim {\0A size: 10\0A}\0A"} : (tensor<10xi32>) -> (tensor<10xi32>, !_tf.control) +# CHECK-NEXT: %3:2 = "_tf.Placeholder.input"(%arg2) {device = "", dtype = "tfdtype$DT_INT32", name = "unused_input", shape = "tfshape$dim { size: 10 }"} : (tensor<10xi32>) -> (tensor<10xi32>, !_tf.control) # CHECK-NEXT: return %2#0 : tensor<10xi32> # CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/string-attr.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/string-attr.pbtxt index 790fb0c7334..c6f0730070f 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/string-attr.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/string-attr.pbtxt @@ -42,6 +42,6 @@ versions { } # CHECK: func @main() { -# CHECK-NEXT: %0:2 = "_tf.Const"() {_output_shapes = ["tfshape$dim {\0A size: 3\0A}\0A"], device = "", dtype = "tfdtype$DT_STRING", name = "save/SaveV2/shape_and_slices", value = opaque<"tf", "0x746674656E736F722464747970653A2044545F535452494E470A74656E736F725F7368617065207B0A202064696D207B0A2020202073697A653A20330A20207D0A7D0A737472696E675F76616C3A2022220A737472696E675F76616C3A2022220A737472696E675F76616C3A2022220A"> : tensor<3x!tf.string>} : () -> (tensor<3x!tf.string>, !_tf.control) +# CHECK-NEXT: %0:2 = "_tf.Const"() {_output_shapes = ["tfshape$dim { size: 3 }"], device = "", dtype = "tfdtype$DT_STRING", name = "save/SaveV2/shape_and_slices", value = 
opaque<"tf", "0x746674656E736F722464747970653A2044545F535452494E472074656E736F725F7368617065207B2064696D207B2073697A653A2033207D207D20737472696E675F76616C3A20222220737472696E675F76616C3A20222220737472696E675F76616C3A202222"> : tensor<3x!tf.string>} : () -> (tensor<3x!tf.string>, !_tf.control) # CHECK-NEXT: return # CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/mangling_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/mangling_util.cc index 776a7ac71b2..691caab526a 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/mangling_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/mangling_util.cc @@ -69,7 +69,7 @@ MangledKind GetMangledKind(absl::string_view str) { } string MangleShape(const TensorShapeProto& shape) { - return absl::StrCat(kTensorShapePrefix, shape.DebugString()); + return absl::StrCat(kTensorShapePrefix, shape.ShortDebugString()); } Status DemangleShape(absl::string_view str, TensorShapeProto* proto) { @@ -85,7 +85,7 @@ Status DemangleShape(absl::string_view str, TensorShapeProto* proto) { } string MangleTensor(const TensorProto& tensor) { - return absl::StrCat(kTensorPrefix, tensor.DebugString()); + return absl::StrCat(kTensorPrefix, tensor.ShortDebugString()); } Status DemangleTensor(absl::string_view str, TensorProto* proto) { From 9f0d8017a9188b17a115b0f64b097a611306d273 Mon Sep 17 00:00:00 2001 From: Edward Loper Date: Wed, 24 Jul 2019 11:58:47 -0700 Subject: [PATCH 0486/3053] Implement CompositeTensor support in nested_structure_coder.py PiperOrigin-RevId: 259787059 --- tensorflow/core/protobuf/struct.proto | 23 ++++- .../saved_model/nested_structure_coder.py | 66 +++++++++++++++ .../nested_structure_coder_test.py | 84 +++++++++++++++++++ 3 files changed, 172 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/protobuf/struct.proto b/tensorflow/core/protobuf/struct.proto index 55b9b520a89..48a97c9455d 100644 --- a/tensorflow/core/protobuf/struct.proto +++ b/tensorflow/core/protobuf/struct.proto @@ -56,6 +56,8 @@ message StructuredValue { tensorflow.DataType tensor_dtype_value = 32; // Represents a value for tf.TensorSpec. TensorSpecProto tensor_spec_value = 33; + // Represents a value for tf.TypeSpec. + TypeSpecProto type_spec_value = 34; // Represents a list of `Value`. ListValue list_value = 51; @@ -104,4 +106,23 @@ message TensorSpecProto { string name = 1; tensorflow.TensorShapeProto shape = 2; tensorflow.DataType dtype = 3; -}; +} + +// Represents a tf.TypeSpec +message TypeSpecProto { + enum TypeSpecClass { + UNKNOWN = 0; + SPARSE_TENSOR_SPEC = 1; // tf.SparseTensorSpec + INDEXED_SLICES_SPEC = 2; // tf.IndexedSlicesSpec + RAGGED_TENSOR_SPEC = 3; // tf.RaggedTensorSpec + TENSOR_ARRAY_SPEC = 4; // tf.TensorArraySpec + DATA_DATASET_SPEC = 5; // tf.data.DatasetSpec + DATA_ITERATOR_SPEC = 6; // IteratorSpec from data/ops/iterator_ops.py + OPTIONAL_SPEC = 7; // tf.OptionalSpec + PER_REPLICA_SPEC = 8; // PerReplicaSpec from distribute/values.py + } + TypeSpecClass type_spec_class = 1; + + // The value returned by TypeSpec._serialize(). 
+ StructuredValue type_state = 2; +} diff --git a/tensorflow/python/saved_model/nested_structure_coder.py b/tensorflow/python/saved_model/nested_structure_coder.py index 66b02b119d1..ae6c737327f 100644 --- a/tensorflow/python/saved_model/nested_structure_coder.py +++ b/tensorflow/python/saved_model/nested_structure_coder.py @@ -35,9 +35,17 @@ import functools import six from tensorflow.core.protobuf import struct_pb2 +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.data.ops import iterator_ops +from tensorflow.python.data.ops import optional_ops +from tensorflow.python.distribute import values from tensorflow.python.framework import dtypes +from tensorflow.python.framework import indexed_slices +from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_spec +from tensorflow.python.ops import tensor_array_ops +from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.util import compat @@ -435,3 +443,61 @@ class _TensorSpecCodec(object): StructureCoder.register_codec(_TensorSpecCodec()) + + +class _TypeSpecCodec(object): + """Codec for `tf.TypeSpec`.""" + + # Mapping from enum value to type (TypeSpec subclass). + TYPE_SPEC_CLASS_FROM_PROTO = { + struct_pb2.TypeSpecProto.SPARSE_TENSOR_SPEC: + sparse_tensor.SparseTensorSpec, + struct_pb2.TypeSpecProto.INDEXED_SLICES_SPEC: + indexed_slices.IndexedSlicesSpec, + struct_pb2.TypeSpecProto.RAGGED_TENSOR_SPEC: + ragged_tensor.RaggedTensorSpec, + struct_pb2.TypeSpecProto.TENSOR_ARRAY_SPEC: + tensor_array_ops.TensorArraySpec, + struct_pb2.TypeSpecProto.DATA_DATASET_SPEC: + dataset_ops.DatasetSpec, + struct_pb2.TypeSpecProto.DATA_ITERATOR_SPEC: + iterator_ops.IteratorSpec, + struct_pb2.TypeSpecProto.OPTIONAL_SPEC: + optional_ops.OptionalSpec, + struct_pb2.TypeSpecProto.PER_REPLICA_SPEC: + values.PerReplicaSpec, + } + + # Mapping from type (TypeSpec subclass) to enum value. 
+ TYPE_SPEC_CLASS_TO_PROTO = dict( + (cls, enum) for (enum, cls) in TYPE_SPEC_CLASS_FROM_PROTO.items()) + + def can_encode(self, pyobj): + # pylint: disable=unidiomatic-typecheck + return type(pyobj) in self.TYPE_SPEC_CLASS_TO_PROTO + + def do_encode(self, type_spec_value, encode_fn): + """Returns an encoded proto for the given `tf.TypeSpec`.""" + type_spec_class = self.TYPE_SPEC_CLASS_TO_PROTO[type(type_spec_value)] + type_state = type_spec_value._serialize() # pylint: disable=protected-access + encoded_type_spec = struct_pb2.StructuredValue() + encoded_type_spec.type_spec_value.CopyFrom( + struct_pb2.TypeSpecProto( + type_spec_class=type_spec_class, type_state=encode_fn(type_state))) + return encoded_type_spec + + def can_decode(self, value): + return ( + value.HasField("type_spec_value") and + value.type_spec_value.type_spec_class in self.TYPE_SPEC_CLASS_FROM_PROTO + ) + + def do_decode(self, value, decode_fn): + type_spec_proto = value.type_spec_value + type_spec_class_enum = type_spec_proto.type_spec_class + type_spec_class = self.TYPE_SPEC_CLASS_FROM_PROTO[type_spec_class_enum] + # pylint: disable=protected-access + return type_spec_class._deserialize(decode_fn(type_spec_proto.type_state)) + + +StructureCoder.register_codec(_TypeSpecCodec()) diff --git a/tensorflow/python/saved_model/nested_structure_coder_test.py b/tensorflow/python/saved_model/nested_structure_coder_test.py index 16c56b1ddbf..41d61d8cc08 100644 --- a/tensorflow/python/saved_model/nested_structure_coder_test.py +++ b/tensorflow/python/saved_model/nested_structure_coder_test.py @@ -20,10 +20,14 @@ from __future__ import print_function import collections +from google.protobuf import text_format from tensorflow.core.protobuf import struct_pb2 +from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import dtypes +from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_spec +from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.platform import test from tensorflow.python.saved_model import nested_structure_coder @@ -187,6 +191,86 @@ class NestedStructureTest(test.TestCase): decoded = self._coder.decode_proto(encoded) self.assertEqual(structure, decoded) + def testEncodeDecodeRaggedTensorSpec(self): + structure = [ragged_tensor.RaggedTensorSpec( + [1, 2, 3], dtypes.int64, 2, dtypes.int32)] + self.assertTrue(self._coder.can_encode(structure)) + encoded = self._coder.encode_structure(structure) + expected_pbtxt = r""" + list_value { + values { + type_spec_value { + type_spec_class: RAGGED_TENSOR_SPEC + type_state { + tuple_value { + # spec._shape + values { + tensor_shape_value { + dim { size: 1 } + dim { size: 2 } + dim { size: 3 } + } + } + # spec._dtype + values { tensor_dtype_value: DT_INT64 } + # spec._ragged_rank + values { int64_value: 2 } + # spec._row_splits_dtype + values { tensor_dtype_value: DT_INT32 } + } + } + } + } + } + """ + expected = struct_pb2.StructuredValue() + text_format.Parse(expected_pbtxt, expected) + self.assertEqual(expected, encoded) + decoded = self._coder.decode_proto(encoded) + self.assertEqual(structure, decoded) + + def testEncodeDecodeSparseTensorSpec(self): + structure = [sparse_tensor.SparseTensorSpec([10, 20], dtypes.float32)] + self.assertTrue(self._coder.can_encode(structure)) + encoded = self._coder.encode_structure(structure) + expected_pbtxt = r""" + list_value { + values { + type_spec_value { + type_spec_class: SPARSE_TENSOR_SPEC + 
type_state { + tuple_value { + # spec._shape + values { + tensor_shape_value { + dim { size: 10 } + dim { size: 20 } + } + } + # spec._dtype + values { tensor_dtype_value: DT_FLOAT } + } + } + } + } + } + """ + expected = struct_pb2.StructuredValue() + text_format.Parse(expected_pbtxt, expected) + self.assertEqual(expected, encoded) + decoded = self._coder.decode_proto(encoded) + self.assertEqual(structure, decoded) + + def testEncodeDataSetSpec(self): + structure = [dataset_ops.DatasetSpec( + {"rt": ragged_tensor.RaggedTensorSpec([10, None], dtypes.int32), + "st": sparse_tensor.SparseTensorSpec([10, 20], dtypes.float32), + "t": tensor_spec.TensorSpec([10, 8], dtypes.string)})] + self.assertTrue(self._coder.can_encode(structure)) + encoded = self._coder.encode_structure(structure) + decoded = self._coder.decode_proto(encoded) + self.assertEqual(structure, decoded) + def testNotEncodable(self): class NotEncodable(object): From 0b74dd62a0c4a4914d12fb97b55884b1cfbf6ada Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Wed, 24 Jul 2019 12:39:17 -0700 Subject: [PATCH 0487/3053] Simplify graphdef2mlir/graph-while-loop.pbtxt test to be more targeted The purpose of this test is to verify that we import a NextIteration backedge into a pair of source/sink node in MLIR. The CHECK lines are updated to verify specifically this. PiperOrigin-RevId: 259794648 --- .../graphdef2mlir/graph-while-loop.pbtxt | 27 +++++++------------ 1 file changed, 10 insertions(+), 17 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-while-loop.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-while-loop.pbtxt index ac84234e4ac..f60fb46affb 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-while-loop.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-while-loop.pbtxt @@ -1,5 +1,15 @@ # RUN: tf-mlir-translate -graphdef-to-mlir -mlir-print-debuginfo %s -o - | FileCheck %s +# Verify that importing a Graph with a backedge leads to two NextIteration nodes +# to break the cycle. 
+ +# CHECK-LABEL: func @main() +# CHECK: %[[NEXTITERATION:[0-9]+]]:2 = "_tf.NextIteration.source" +# CHECK: tf.Merge"({{.*}} %[[NEXTITERATION]]#0) + +# CHECK: %[[ADD:[0-9]+]]:2 = "_tf.Add" +# CHECK: "_tf.NextIteration.sink"(%[[ADD]]#0) + node { name: "Const" op: "Const" @@ -203,20 +213,3 @@ versions { producer: 27 } -# CHECK: func @main() { -# CHECK-NEXT: %0:2 = "_tf.NextIteration.source"() {T = "tfdtype$DT_INT32", device = "", name = "while/NextIteration"} : () -> (tensor<*xi32>, !_tf.control) loc("while/NextIteration") -# CHECK-NEXT: %1:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "Const", value = dense<0> : tensor} : () -> (tensor, !_tf.control) loc("Const") -# CHECK-NEXT: %2:2 = "_tf.Enter"(%1#0) {T = "tfdtype$DT_INT32", device = "", frame_name = "while/while_context", is_constant = false, name = "while/Enter", parallel_iterations = 10 : i64} : (tensor) -> (tensor<*xi32>, !_tf.control) loc("while/Enter") -# CHECK-NEXT: %3:3 = "_tf.Merge"(%2#0, %0#0) {N = 2 : i64, T = "tfdtype$DT_INT32", device = "", name = "while/Merge"} : (tensor<*xi32>, tensor<*xi32>) -> (tensor<*xi32>, tensor, !_tf.control) loc("while/Merge") -# CHECK-NEXT: %4:2 = "_tf.Const"(%3#2) {device = "", dtype = "tfdtype$DT_INT32", name = "while/Less/y", value = dense<10> : tensor} : (!_tf.control) -> (tensor, !_tf.control) loc("while/Less/y") -# CHECK-NEXT: %5:2 = "_tf.Less"(%3#0, %4#0) {T = "tfdtype$DT_INT32", device = "", name = "while/Less"} : (tensor<*xi32>, tensor) -> (tensor<*xi1>, !_tf.control) loc("while/Less") -# CHECK-NEXT: %6:2 = "_tf.LoopCond"(%5#0) {device = "", name = "while/LoopCond"} : (tensor<*xi1>) -> (tensor, !_tf.control) loc("while/LoopCond") -# CHECK-NEXT: %7:3 = "_tf.Switch"(%3#0, %6#0) {T = "tfdtype$DT_INT32", _class = ["loc:@while/Merge"], device = "", name = "while/Switch"} : (tensor<*xi32>, tensor) -> (tensor<*xi32>, tensor<*xi32>, !_tf.control) loc("while/Switch") -# CHECK-NEXT: %8:2 = "_tf.Exit"(%7#0) {T = "tfdtype$DT_INT32", device = "", name = "while/Exit"} : (tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) loc("while/Exit") -# CHECK-NEXT: %9:2 = "_tf.Identity"(%7#1) {T = "tfdtype$DT_INT32", device = "", name = "while/Identity"} : (tensor<*xi32>) -> (tensor<*xi32>, !_tf.control) loc("while/Identity") -# CHECK-NEXT: %10:2 = "_tf.Const"(%9#1) {device = "", dtype = "tfdtype$DT_INT32", name = "while/Add/y", value = dense<1> : tensor} : (!_tf.control) -> (tensor, !_tf.control) loc("while/Add/y") -# CHECK-NEXT: %11:2 = "_tf.Add"(%9#0, %10#0) {T = "tfdtype$DT_INT32", device = "", name = "while/Add"} : (tensor<*xi32>, tensor) -> (tensor<*xi32>, !_tf.control) loc("while/Add") -# CHECK-NEXT: %12 = "_tf.NextIteration.sink"(%11#0) {T = "tfdtype$DT_INT32", device = "", name = "while/NextIteration"} : (tensor<*xi32>) -> !_tf.control loc("while/NextIteration") -# CHECK-NEXT: return loc(unknown) -# CHECK-NEXT: } - From 2b62df37e1c293f7513eafe3ceb8ced2299e369a Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Wed, 24 Jul 2019 12:42:09 -0700 Subject: [PATCH 0488/3053] Cache OpDef protobufs in the Graph object. This avoids serializing and deserializing an OpDef protobuf for each created op, and reduces the per-op memory overhead. 
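A rough sketch of the pattern, for illustration only: the change keeps a plain dictionary keyed by op type name on the Python Graph object and consults it before doing the C-API fetch and protobuf parse. The C++ sketch below shows the same memoization idea; OpDef, FetchOpDef, and OpDefCache here are hypothetical stand-ins rather than TensorFlow APIs.

#include <string>
#include <unordered_map>

// Hypothetical stand-ins for the real lookup; the actual change caches
// parsed tensorflow.OpDef protos fetched via the C API.
struct OpDef { std::string name; };
OpDef FetchOpDef(const std::string& op_type) { return OpDef{op_type}; }

class OpDefCache {
 public:
  const OpDef& Get(const std::string& op_type) {
    auto it = cache_.find(op_type);
    if (it == cache_.end()) {
      // Pay the fetch/parse cost only the first time a type is seen.
      it = cache_.emplace(op_type, FetchOpDef(op_type)).first;
    }
    return it->second;
  }

 private:
  std::unordered_map<std::string, OpDef> cache_;
};
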
PiperOrigin-RevId: 259795215 --- tensorflow/python/framework/ops.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index d710e7db0cf..a20cc832232 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -2815,6 +2815,8 @@ class Graph(object): # Set to True if this graph is being built in an # AutomaticControlDependencies context. self._add_control_dependencies = False + # Cache for OpDef protobufs retrieved via the C API. + self._op_def_cache = {} # TODO(skyewm): fold as much of the above as possible into the C # implementation @@ -3715,14 +3717,20 @@ class Graph(object): def _get_op_def(self, type): # pylint: disable=redefined-builtin """Returns the `OpDef` proto for `type`. `type` is a string.""" - with c_api_util.tf_buffer() as buf: - # pylint: disable=protected-access - c_api.TF_GraphGetOpDef(self._c_graph, compat.as_bytes(type), buf) - # pylint: enable=protected-access - data = c_api.TF_GetBuffer(buf) - op_def = op_def_pb2.OpDef() - op_def.ParseFromString(compat.as_bytes(data)) - return op_def + # NOTE: No locking is required because the lookup and insertion operations + # on Python dictionaries are atomic. + try: + return self._op_def_cache[type] + except KeyError: + with c_api_util.tf_buffer() as buf: + # pylint: disable=protected-access + c_api.TF_GraphGetOpDef(self._c_graph, compat.as_bytes(type), buf) + # pylint: enable=protected-access + data = c_api.TF_GetBuffer(buf) + op_def = op_def_pb2.OpDef() + op_def.ParseFromString(compat.as_bytes(data)) + self._op_def_cache[type] = op_def + return op_def def as_default(self): """Returns a context manager that makes this `Graph` the default graph. From d6667327ae07e115d1cebac16ff35cbc64ca675c Mon Sep 17 00:00:00 2001 From: Jiri Simsa Date: Wed, 24 Jul 2019 12:43:01 -0700 Subject: [PATCH 0489/3053] [tf.data] Implement cancellation support for blocking user-defined functions. 
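In brief, and only as a condensed sketch of the mechanism this patch adds (mirroring the new ConnectCancellationManagers helper in dataset_utils.cc, with error handling omitted; the name Connect below is illustrative): a cancellation triggered on the parent CancellationManager is forwarded to the child manager that the user-defined function runs under, and the returned callable removes the registration once the work completes, so the parent never fires a callback into an already-destroyed child manager.

#include <functional>
#include "tensorflow/core/framework/cancellation.h"

// Forward cancellations from `parent` to `child`; invoke the returned
// function to deregister the callback when the child's work is done.
std::function<void()> Connect(tensorflow::CancellationManager* parent,
                              tensorflow::CancellationManager* child) {
  tensorflow::CancellationToken token = parent->get_cancellation_token();
  // RegisterCallback returns false if `parent` was already cancelled; the
  // real helper turns that case into a Cancelled status.
  parent->RegisterCallback(token, [child]() { child->StartCancel(); });
  return [parent, token]() { parent->DeregisterCallback(token); };
}
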
PiperOrigin-RevId: 259795364 --- tensorflow/core/common_runtime/data/BUILD | 1 + .../core/common_runtime/data/standalone.cc | 27 +- .../core/common_runtime/data/standalone.h | 2 + tensorflow/core/framework/dataset.h | 9 + .../core/kernels/data/captured_function.cc | 118 ++--- .../core/kernels/data/captured_function.h | 3 + .../core/kernels/data/dataset_test_base.cc | 1 + tensorflow/core/kernels/data/dataset_utils.cc | 18 + tensorflow/core/kernels/data/dataset_utils.h | 7 + .../data/experimental/to_tf_record_op.cc | 124 +++-- tensorflow/core/kernels/data/iterator_ops.cc | 458 ++++++++++-------- tensorflow/core/kernels/data/iterator_ops.h | 19 +- .../kernels/data/multi_device_iterator_ops.cc | 61 ++- .../core/kernels/data/prefetch_dataset_op.cc | 3 +- .../python/data/kernel_tests/map_test.py | 25 + .../python/data/kernel_tests/test_base.py | 2 +- 16 files changed, 521 insertions(+), 357 deletions(-) diff --git a/tensorflow/core/common_runtime/data/BUILD b/tensorflow/core/common_runtime/data/BUILD index 2544cc67af6..190901847a2 100644 --- a/tensorflow/core/common_runtime/data/BUILD +++ b/tensorflow/core/common_runtime/data/BUILD @@ -14,6 +14,7 @@ cc_library( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:session_options", + "@com_google_absl//absl/memory", ], ) diff --git a/tensorflow/core/common_runtime/data/standalone.cc b/tensorflow/core/common_runtime/data/standalone.cc index eebf00096a0..21becb37ed5 100644 --- a/tensorflow/core/common_runtime/data/standalone.cc +++ b/tensorflow/core/common_runtime/data/standalone.cc @@ -17,6 +17,7 @@ limitations under the License. #include +#include "absl/memory/memory.h" #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/common_runtime/function.h" @@ -45,20 +46,17 @@ Status Dataset::FromGraph(Params params, const GraphDef& graph_def, Graph graph(OpRegistry::Global()); TF_RETURN_IF_ERROR(ImportGraphDef({}, graph_def, &graph, nullptr)); - // Instantiate enough of the TensorFlow runtime to run `graph` on a single CPU - // device. - std::unique_ptr device_mgr = - MakeUnique(DeviceFactory::NewDevice( - "CPU", params.session_options, "/job:localhost/replica:0/task:0")); + // Instantiate enough of the TF runtime to run `graph` on a single CPU device. + auto device_mgr = absl::make_unique(DeviceFactory::NewDevice( + "CPU", params.session_options, "/job:localhost/replica:0/task:0")); Device* device = device_mgr->ListDevices()[0]; // Clone the `FunctionLibraryDefinition` to extend its lifetime extends beyond // the lifetime of `graph`. 
- std::unique_ptr flib_def = - MakeUnique(graph.flib_def()); - std::unique_ptr pflr = - MakeUnique( - device_mgr.get(), Env::Default(), TF_GRAPH_DEF_VERSION, - flib_def.get(), OptimizerOptions{}, nullptr /* parent */); + auto flib_def = + absl::make_unique(graph.flib_def()); + auto pflr = absl::make_unique( + device_mgr.get(), Env::Default(), TF_GRAPH_DEF_VERSION, flib_def.get(), + OptimizerOptions{}, nullptr /* parent */); string fetch_node = ""; for (auto node : graph_def.node()) { @@ -107,7 +105,10 @@ Status Dataset::MakeIterator(std::unique_ptr* result) { OpKernelContext op_ctx(&op_params, 0); IteratorContext::Params params(&op_ctx); params.function_handle_cache = function_handle_cache_.get(); - ctx = MakeUnique(std::move(params)); + params.resource_mgr = &resource_mgr_; + params.cancellation_manager = &cancellation_manager_; + + ctx = absl::make_unique(std::move(params)); } // Create the iterator from the dataset. @@ -129,7 +130,7 @@ Dataset::Dataset(DatasetBase* dataset, DeviceMgr* device_mgr, pool_(pool) { runner_ = [this](std::function c) { pool_->Schedule(std::move(c)); }; function_handle_cache_ = - MakeUnique(pflr_->GetFLR("/device:CPU:0")); + absl::make_unique(pflr_->GetFLR("/device:CPU:0")); } Dataset::~Dataset() { dataset_->Unref(); } diff --git a/tensorflow/core/common_runtime/data/standalone.h b/tensorflow/core/common_runtime/data/standalone.h index 7ec420ab8ac..70a6820c63f 100644 --- a/tensorflow/core/common_runtime/data/standalone.h +++ b/tensorflow/core/common_runtime/data/standalone.h @@ -111,6 +111,8 @@ class Dataset { std::unique_ptr pool_; std::unique_ptr function_handle_cache_; std::function)> runner_; + ResourceMgr resource_mgr_; + CancellationManager cancellation_manager_; }; } // namespace standalone diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h index 712865ee68f..abca3534cd7 100644 --- a/tensorflow/core/framework/dataset.h +++ b/tensorflow/core/framework/dataset.h @@ -22,6 +22,7 @@ limitations under the License. #include "absl/memory/memory.h" #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/attr_value_util.h" +#include "tensorflow/core/framework/cancellation.h" #include "tensorflow/core/framework/dataset_stateful_op_whitelist.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph.pb.h" @@ -298,6 +299,7 @@ class IteratorContext { struct Params { explicit Params(IteratorContext* ctx) : allocator_getter(ctx->allocator_getter()), + cancellation_manager(ctx->cancellation_manager()), env(ctx->env()), flr(ctx->flr()), function_handle_cache(ctx->function_handle_cache()), @@ -343,6 +345,9 @@ class IteratorContext { // The Allocator to be used to allocate the output of an iterator. std::function allocator_getter = nullptr; + // The CancellationManager to be used to cancel execution of ops. + CancellationManager* cancellation_manager; + // Interface to operating system functionality. 
Env* env = nullptr; @@ -387,6 +392,10 @@ class IteratorContext { return params_.allocator_getter; } + CancellationManager* cancellation_manager() { + return params_.cancellation_manager; + } + Env* env() const { return params_.env; } FunctionLibraryRuntime* flr() { return params_.flr; } diff --git a/tensorflow/core/kernels/data/captured_function.cc b/tensorflow/core/kernels/data/captured_function.cc index 26290166c1e..89656b9abfb 100644 --- a/tensorflow/core/kernels/data/captured_function.cc +++ b/tensorflow/core/kernels/data/captured_function.cc @@ -401,7 +401,8 @@ Status CapturedFunction::Instantiate( *instantiated_captured_function = absl::WrapUnique( new InstantiatedCapturedFunction(lib, f_handle, std::move(ret_types), - *ctx->runner(), this)); + *ctx->runner(), + ctx->cancellation_manager(), this)); return Status::OK(); } @@ -522,11 +523,12 @@ class BorrowedArgsCallFrame : public CallFrameBase { InstantiatedCapturedFunction::InstantiatedCapturedFunction( FunctionLibraryRuntime* lib, FunctionLibraryRuntime::Handle f_handle, DataTypeVector ret_types, std::function)> runner, - CapturedFunction* captured_func) + CancellationManager* cancellation_manager, CapturedFunction* captured_func) : lib_(lib), f_handle_(f_handle), ret_types_(std::move(ret_types)), captured_runner_(std::move(runner)), + cancellation_manager_(cancellation_manager), captured_func_(captured_func) {} // NOTE: We don't release f_handle_ here and instead delegate the function @@ -552,14 +554,12 @@ Status InstantiatedCapturedFunction::Run(IteratorContext* ctx, f_opts.step_container = &step_container; f_opts.runner = ctx->runner(); f_opts.create_rendezvous = ShouldCreateRendezvous(); - // TODO(mrry): Add cancellation manager support to IteratorContext - // so that we can cancel running map functions. The local - // cancellation manager here is created so that we can run kernels - // (such as queue kernels) that depend on the non-nullness of - // `OpKernelContext::cancellation_manager()`, but additional effort - // will be required to plumb it through the `IteratorContext`. - CancellationManager c_mgr; - f_opts.cancellation_manager = &c_mgr; + CancellationManager cancellation_manager; + f_opts.cancellation_manager = &cancellation_manager; + std::function deregister_fn; + TF_RETURN_IF_ERROR(ConnectCancellationManagers( + cancellation_manager_, &cancellation_manager, &deregister_fn)); + auto cleanup = gtl::MakeCleanup(std::move(deregister_fn)); OwnedArgsCallFrame frame(std::move(args), &captured_func_->captured_inputs(), ret_types_); @@ -590,14 +590,12 @@ Status InstantiatedCapturedFunction::RunWithBorrowedArgs( f_opts.step_container = &step_container; f_opts.runner = ctx->runner(); f_opts.create_rendezvous = ShouldCreateRendezvous(); - // TODO(mrry): Add cancellation manager support to IteratorContext - // so that we can cancel running map functions. The local - // cancellation manager here is created so that we can run kernels - // (such as queue kernels) that depend on the non-nullness of - // `OpKernelContext::cancellation_manager()`, but additional effort - // will be required to plumb it through the `IteratorContext`. 
- CancellationManager c_mgr; - f_opts.cancellation_manager = &c_mgr; + CancellationManager cancellation_manager; + f_opts.cancellation_manager = &cancellation_manager; + std::function deregister_fn; + TF_RETURN_IF_ERROR(ConnectCancellationManagers( + cancellation_manager_, &cancellation_manager, &deregister_fn)); + auto cleanup = gtl::MakeCleanup(std::move(deregister_fn)); BorrowedArgsCallFrame frame(args, &captured_func_->captured_inputs(), ret_types_); @@ -628,14 +626,12 @@ Status InstantiatedCapturedFunction::RunInstantiated( f_opts.step_container = &step_container; f_opts.runner = &captured_runner_; f_opts.create_rendezvous = ShouldCreateRendezvous(); - // TODO(mrry): Add cancellation manager support to IteratorContext - // so that we can cancel running map functions. The local - // cancellation manager here is created so that we can run kernels - // (such as queue kernels) that depend on the non-nullness of - // `OpKernelContext::cancellation_manager()`, but additional effort - // will be required to plumb it through the `IteratorContext`. - CancellationManager c_mgr; - f_opts.cancellation_manager = &c_mgr; + CancellationManager cancellation_manager; + f_opts.cancellation_manager = &cancellation_manager; + std::function deregister_fn; + TF_RETURN_IF_ERROR(ConnectCancellationManagers( + cancellation_manager_, &cancellation_manager, &deregister_fn)); + auto cleanup = gtl::MakeCleanup(std::move(deregister_fn)); BorrowedArgsCallFrame frame(args, &captured_func_->captured_inputs(), ret_types_); @@ -681,59 +677,65 @@ void InstantiatedCapturedFunction::RunAsync( f_opts.step_container = step_container; f_opts.runner = ctx->runner(); f_opts.create_rendezvous = ShouldCreateRendezvous(); - // TODO(mrry): Add cancellation manager support to IteratorContext - // so that we can cancel running map functions. The local - // cancellation manager here is created so that we can run kernels - // (such as queue kernels) that depend on the non-nullness of - // `OpKernelContext::cancellation_manager()`, but additional effort - // will be required to plumb it through the `IteratorContext`. - CancellationManager* c_mgr = new CancellationManager(); - f_opts.cancellation_manager = c_mgr; + auto cancellation_manager = absl::make_unique(); + f_opts.cancellation_manager = cancellation_manager.get(); + std::function deregister_fn; + Status s = ConnectCancellationManagers( + ctx->cancellation_manager(), cancellation_manager.get(), &deregister_fn); + if (!s.ok()) { + done(s); + return; + } + std::shared_ptr stats_collector; if (ctx->model() || ctx->stats_aggregator()) { stats_collector = absl::make_unique(); } f_opts.stats_collector = stats_collector.get(); + // Transfer ownership of the cancellation manager to `callback`. + CancellationManager* raw_cancellation_manager = + cancellation_manager.release(); auto callback = std::bind( - [this, rets, step_container, c_mgr, frame]( + [this, rets, step_container, raw_cancellation_manager, frame]( const FunctionLibraryRuntime::DoneCallback& done, - const std::shared_ptr& model, - const std::shared_ptr& stats_aggregator, + IteratorContext* ctx, const std::function& deregister_fn, const string& prefix, const std::shared_ptr& stats_collector, // Begin unbound arguments. Status s) { delete step_container; - delete c_mgr; + deregister_fn(); + delete raw_cancellation_manager; if (s.ok()) { s = frame->ConsumeRetvals(rets); } delete frame; - // TODO(b/129085499) Utilize the `node_name` which would be unique than - // the prefix for the function execution time statistics. 
- // prefix_with_func_name would then be node_name + func_name. - if (stats_aggregator) { - string prefix_end = - str_util::Split(prefix, "::", str_util::SkipEmpty()).back(); - string prefix_with_func_name = - strings::StrCat(prefix_end, stats_utils::kDelimiter, - captured_func_->func().name()); - stats_aggregator->AddToHistogram( - stats_utils::ExecutionTimeHistogramName(prefix_with_func_name), - {static_cast(stats_collector->processing_time())}, - model->NumElements(prefix)); - } - if (model) { - model->AddProcessingTime(prefix, stats_collector->processing_time()); - model->RecordStart(prefix, false /* stop_output */); + if (ctx->model()) { + // TODO(b/129085499) Utilize the `node_name` which would be unique + // than the prefix for the function execution time statistics. + // prefix_with_func_name would then be node_name + func_name. + if (ctx->stats_aggregator()) { + string prefix_end = + str_util::Split(prefix, "::", str_util::SkipEmpty()).back(); + string prefix_with_func_name = + strings::StrCat(prefix_end, stats_utils::kDelimiter, + captured_func_->func().name()); + ctx->stats_aggregator()->AddToHistogram( + stats_utils::ExecutionTimeHistogramName(prefix_with_func_name), + {static_cast(stats_collector->processing_time())}, + ctx->model()->NumElements(prefix)); + } + ctx->model()->AddProcessingTime(prefix, + stats_collector->processing_time()); + ctx->model()->RecordStart(prefix, false /* stop_output */); } done(s); - if (model) { - model->RecordStop(prefix, false /* start_output */); + if (ctx->model()) { + ctx->model()->RecordStop(prefix, false /* start_output */); } }, - std::move(done), ctx->model(), ctx->stats_aggregator(), prefix, + std::move(done), ctx, std::move(deregister_fn), prefix, std::move(stats_collector), std::placeholders::_1); lib_->Run(f_opts, f_handle_, frame, std::move(callback)); diff --git a/tensorflow/core/kernels/data/captured_function.h b/tensorflow/core/kernels/data/captured_function.h index 258fe172004..b020f530eda 100644 --- a/tensorflow/core/kernels/data/captured_function.h +++ b/tensorflow/core/kernels/data/captured_function.h @@ -18,6 +18,7 @@ limitations under the License. 
#include #include +#include "tensorflow/core/framework/cancellation.h" #include "tensorflow/core/framework/dataset.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/op_kernel.h" @@ -93,6 +94,7 @@ class InstantiatedCapturedFunction { FunctionLibraryRuntime* lib, FunctionLibraryRuntime::Handle f_handle, DataTypeVector ret_types, std::function)> runner, + CancellationManager* cancellation_manager, CapturedFunction* captured_func); // Determines whether a rendezvous object should be created when running the @@ -105,6 +107,7 @@ class InstantiatedCapturedFunction { const FunctionLibraryRuntime::Handle f_handle_; const DataTypeVector ret_types_; std::function)> captured_runner_; + CancellationManager* cancellation_manager_; CapturedFunction* const captured_func_; TF_DISALLOW_COPY_AND_ASSIGN(InstantiatedCapturedFunction); diff --git a/tensorflow/core/kernels/data/dataset_test_base.cc b/tensorflow/core/kernels/data/dataset_test_base.cc index 2a5f03edf16..2854bfdc9b5 100644 --- a/tensorflow/core/kernels/data/dataset_test_base.cc +++ b/tensorflow/core/kernels/data/dataset_test_base.cc @@ -350,6 +350,7 @@ Status DatasetOpsTestBase::CreateIteratorContext( params.resource_mgr = op_context->resource_manager(); function_handle_cache_ = absl::make_unique(flr_); params.function_handle_cache = function_handle_cache_.get(); + params.cancellation_manager = cancellation_manager_.get(); *iterator_context = absl::make_unique(params); return Status::OK(); } diff --git a/tensorflow/core/kernels/data/dataset_utils.cc b/tensorflow/core/kernels/data/dataset_utils.cc index 53128e86b3e..5c81cb6ab3e 100644 --- a/tensorflow/core/kernels/data/dataset_utils.cc +++ b/tensorflow/core/kernels/data/dataset_utils.cc @@ -30,6 +30,7 @@ limitations under the License. #include "tensorflow/core/grappler/optimizers/data/function_utils.h" #include "tensorflow/core/grappler/optimizers/data/graph_utils.h" #include "tensorflow/core/grappler/optimizers/meta_optimizer.h" +#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/lib/strings/proto_serialization.h" @@ -156,6 +157,23 @@ Status AsGraphDef(OpKernelContext* ctx, const DatasetBase* dataset, return Status::OK(); } +Status ConnectCancellationManagers(CancellationManager* parent, + CancellationManager* child, + std::function* deregister_fn) { + if (parent) { + CancellationToken token = parent->get_cancellation_token(); + if (!parent->RegisterCallback(token, [child]() { child->StartCancel(); })) { + return errors::Cancelled("Operation was cancelled"); + } + *deregister_fn = [parent, token]() { parent->DeregisterCallback(token); }; + } else { + VLOG(1) << "Parent cancellation manager is not set. 
Cancellation will " + "not be propagated to the child cancellation manager."; + *deregister_fn = []() {}; + } + return Status::OK(); +} + Status RewriteDataset(OpKernelContext* ctx, const DatasetBase* input, std::function config_factory, bool optimize_function_library, diff --git a/tensorflow/core/kernels/data/dataset_utils.h b/tensorflow/core/kernels/data/dataset_utils.h index 063f524e7ad..fbf7f8e22c7 100644 --- a/tensorflow/core/kernels/data/dataset_utils.h +++ b/tensorflow/core/kernels/data/dataset_utils.h @@ -27,6 +27,13 @@ Status AsGraphDef(OpKernelContext* ctx, const DatasetBase* dataset, SerializationContext&& serialization_ctx, GraphDef* graph_def); +// Creates a connection between "child" and "parent" cancellation managers so +// that parent cancellations are propagated to the child, returning a function +// that can be used to remove the connection. +Status ConnectCancellationManagers(CancellationManager* parent, + CancellationManager* child, + std::function* deregister_fn); + // Rewrites the input dataset using the given config. Status RewriteDataset(OpKernelContext* ctx, const DatasetBase* input, std::function config_factory, diff --git a/tensorflow/core/kernels/data/experimental/to_tf_record_op.cc b/tensorflow/core/kernels/data/experimental/to_tf_record_op.cc index 9af8304735a..1cc3bc0f330 100644 --- a/tensorflow/core/kernels/data/experimental/to_tf_record_op.cc +++ b/tensorflow/core/kernels/data/experimental/to_tf_record_op.cc @@ -47,56 +47,88 @@ class ToTFRecordOp : public AsyncOpKernel { void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override { // The call to `iterator->GetNext()` may block and depend on an inter-op // thread pool thread, so we issue the call using a background thread. - background_worker_.Schedule([this, ctx, done]() { - string filename; - OP_REQUIRES_OK_ASYNC( - ctx, ParseScalarArgument(ctx, "filename", &filename), done); - string compression_type; - OP_REQUIRES_OK_ASYNC(ctx, - ParseScalarArgument(ctx, "compression_type", - &compression_type), - done); - std::unique_ptr file; - OP_REQUIRES_OK_ASYNC(ctx, ctx->env()->NewWritableFile(filename, &file), - done); - std::unique_ptr writer = - absl::make_unique( + background_worker_.Schedule(std::bind( + [this, ctx](std::function& done) { + string filename; + OP_REQUIRES_OK_ASYNC( + ctx, ParseScalarArgument(ctx, "filename", &filename), + done); + string compression_type; + OP_REQUIRES_OK_ASYNC(ctx, + ParseScalarArgument( + ctx, "compression_type", &compression_type), + done); + std::unique_ptr file; + OP_REQUIRES_OK_ASYNC( + ctx, ctx->env()->NewWritableFile(filename, &file), done); + auto writer = absl::make_unique( file.get(), io::RecordWriterOptions::CreateRecordWriterOptions( compression_type)); - DatasetBase* dataset; - OP_REQUIRES_OK_ASYNC( - ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done); - std::unique_ptr iterator; - IteratorContext::Params params(ctx); - std::unique_ptr function_handle_cache = - absl::make_unique(params.flr); - params.function_handle_cache = function_handle_cache.get(); - auto resource_mgr = absl::make_unique(); - params.resource_mgr = resource_mgr.get(); - IteratorContext iter_ctx(std::move(params)); - - OP_REQUIRES_OK_ASYNC( - ctx, - dataset->MakeIterator(&iter_ctx, "ToTFRecordOpIterator", &iterator), - done); - - std::vector components; - components.reserve(dataset->output_dtypes().size()); - bool end_of_sequence; - do { - OP_REQUIRES_OK_ASYNC( - ctx, iterator->GetNext(&iter_ctx, &components, &end_of_sequence), - done); - - if (!end_of_sequence) { + 
DatasetBase* dataset; OP_REQUIRES_OK_ASYNC( - ctx, writer->WriteRecord(components[0].scalar()()), done); - } - components.clear(); - } while (!end_of_sequence); - done(); - }); + ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done); + + IteratorContext::Params params(ctx); + FunctionHandleCache function_handle_cache(params.flr); + params.function_handle_cache = &function_handle_cache; + ResourceMgr resource_mgr; + params.resource_mgr = &resource_mgr; + CancellationManager cancellation_manager; + params.cancellation_manager = &cancellation_manager; + std::function deregister_fn; + OP_REQUIRES_OK_ASYNC(ctx, + ConnectCancellationManagers( + ctx->cancellation_manager(), + params.cancellation_manager, &deregister_fn), + done); + + // Update the `done` callback to deregister the cancellation callback. + done = std::bind( + [](const std::function& done, + const std::function& deregister_fn) { + deregister_fn(); + done(); + }, + std::move(done), std::move(deregister_fn)); + + IteratorContext iter_ctx(std::move(params)); + std::unique_ptr iterator; + OP_REQUIRES_OK_ASYNC( + ctx, + dataset->MakeIterator(&iter_ctx, "ToTFRecordOpIterator", + &iterator), + done); + + // Update the `done` callback to destroy the iterator before calling + // the actual callback to avoid destruction races. + IteratorBase* raw_iterator = iterator.release(); + done = std::bind( + [raw_iterator](const std::function& done) { + delete raw_iterator; + done(); + }, + std::move(done)); + + std::vector components; + components.reserve(dataset->output_dtypes().size()); + bool end_of_sequence; + do { + OP_REQUIRES_OK_ASYNC( + ctx, + raw_iterator->GetNext(&iter_ctx, &components, &end_of_sequence), + done); + + if (!end_of_sequence) { + OP_REQUIRES_OK_ASYNC( + ctx, writer->WriteRecord(components[0].scalar()()), + done); + } + components.clear(); + } while (!end_of_sequence); + done(); + }, + std::move(done))); } private: diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc index 5ae1c155cca..64b7f7c70fc 100644 --- a/tensorflow/core/kernels/data/iterator_ops.cc +++ b/tensorflow/core/kernels/data/iterator_ops.cc @@ -21,7 +21,9 @@ limitations under the License. 
#include "tensorflow/core/common_runtime/input_colocation_exemption_registry.h" #include "tensorflow/core/common_runtime/renamed_device.h" #include "tensorflow/core/common_runtime/threadpool_device.h" +#include "tensorflow/core/framework/cancellation.h" #include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/partial_tensor_shape.h" #include "tensorflow/core/framework/resource_op_kernel.h" #include "tensorflow/core/framework/stats_aggregator.h" @@ -54,7 +56,7 @@ const char kIteratorVariantTypeName[] = "tensorflow::Iterator"; } // namespace -Status IteratorResource::GetNext(IteratorContext* ctx, +Status IteratorResource::GetNext(OpKernelContext* ctx, std::vector* out_tensors, bool* end_of_sequence) { std::shared_ptr captured_state; @@ -68,6 +70,12 @@ Status IteratorResource::GetNext(IteratorContext* ctx, params.function_handle_cache = captured_state->function_handle_cache.get(); params.resource_mgr = &captured_state->resource_mgr; params.thread_factory = unbounded_thread_pool_.get_thread_factory(); + params.cancellation_manager = &captured_state->cancellation_manager; + std::function deregister_fn; + TF_RETURN_IF_ERROR(ConnectCancellationManagers(ctx->cancellation_manager(), + params.cancellation_manager, + &deregister_fn)); + auto cleanup = gtl::MakeCleanup(std::move(deregister_fn)); return captured_state->iterator->GetNext(IteratorContext(std::move(params)), out_tensors, end_of_sequence); } else { @@ -78,12 +86,6 @@ Status IteratorResource::GetNext(IteratorContext* ctx, } } -Status IteratorResource::GetNext(IteratorContext&& ctx, - std::vector* out_tensors, - bool* end_of_sequence) { - return GetNext(&ctx, out_tensors, end_of_sequence); -} - Status IteratorResource::Save(SerializationContext* ctx, IteratorStateWriter* writer) { std::shared_ptr captured_state; @@ -137,7 +139,7 @@ Status IteratorResource::Restore(OpKernelContext* ctx, // serialized function when there is a conflict. 
TF_RETURN_IF_ERROR(AddToFunctionLibrary(flib_def.get(), graph_def.library())); auto new_state = absl::make_unique( - std::move(flib_def), std::move(pflr), flr, nullptr /* iterator */); + std::move(flib_def), std::move(pflr), flr, /*iterator=*/nullptr); TF_RETURN_IF_ERROR( graph_runner.Run(&graph, new_state->flr, {}, {output_node}, &outputs)); @@ -147,28 +149,26 @@ Status IteratorResource::Restore(OpKernelContext* ctx, params.flr = new_state->flr; params.function_handle_cache = new_state->function_handle_cache.get(); params.resource_mgr = &new_state->resource_mgr; + DeviceBase* device = new_state->flr->device(); + params.allocator_getter = [device](AllocatorAttributes attrs) { + return device->GetAllocator(attrs); + }; params.thread_factory = unbounded_thread_pool_.get_thread_factory(); + params.cancellation_manager = &new_state->cancellation_manager; + std::function deregister_fn; + TF_RETURN_IF_ERROR(ConnectCancellationManagers(ctx->cancellation_manager(), + params.cancellation_manager, + &deregister_fn)); + auto cleanup = gtl::MakeCleanup(std::move(deregister_fn)); + IteratorContext iter_ctx(std::move(params)); - TF_RETURN_IF_ERROR(dataset->MakeIterator(IteratorContext(std::move(params)), - "Iterator", &new_state->iterator)); + TF_RETURN_IF_ERROR( + dataset->MakeIterator(&iter_ctx, "Iterator", &new_state->iterator)); TF_RETURN_IF_ERROR( VerifyTypesMatch(output_dtypes_, new_state->iterator->output_dtypes())); TF_RETURN_IF_ERROR(VerifyShapesCompatible( output_shapes_, new_state->iterator->output_shapes())); - - { - IteratorContext::Params params(ctx); - params.flr = new_state->flr; - params.function_handle_cache = new_state->function_handle_cache.get(); - params.resource_mgr = &new_state->resource_mgr; - DeviceBase* device = new_state->flr->device(); - params.allocator_getter = [device](AllocatorAttributes attrs) { - return device->GetAllocator(attrs); - }; - params.thread_factory = unbounded_thread_pool_.get_thread_factory(); - IteratorContext iter_ctx(std::move(params)); - TF_RETURN_IF_ERROR(new_state->iterator->Restore(&iter_ctx, reader)); - } + TF_RETURN_IF_ERROR(new_state->iterator->Restore(&iter_ctx, reader)); mutex_lock l(mu_); iterator_state_ = std::move(new_state); @@ -182,10 +182,8 @@ Status IteratorResource::SetIteratorFromDataset(OpKernelContext* ctx, tf_shared_lock l(mu_); new_state = std::make_shared( iterator_state_->flib_def, iterator_state_->pflr, iterator_state_->flr, - nullptr /* function_handle_cache */, nullptr /* iterator */); + /*iterator=*/nullptr); } - new_state->function_handle_cache = - absl::make_unique(new_state->flr); // Create new iterator. 
std::unique_ptr iterator; IteratorContext::Params params(ctx); @@ -193,13 +191,21 @@ Status IteratorResource::SetIteratorFromDataset(OpKernelContext* ctx, params.function_handle_cache = new_state->function_handle_cache.get(); params.resource_mgr = &new_state->resource_mgr; params.thread_factory = unbounded_thread_pool_.get_thread_factory(); - TF_RETURN_IF_ERROR(dataset->MakeIterator(IteratorContext(std::move(params)), - "Iterator", &iterator)); - TF_RETURN_IF_ERROR( - VerifyTypesMatch(output_dtypes_, iterator->output_dtypes())); - TF_RETURN_IF_ERROR( - VerifyShapesCompatible(output_shapes_, iterator->output_shapes())); - std::swap(new_state->iterator, iterator); + params.cancellation_manager = &new_state->cancellation_manager; + std::function deregister_fn; + TF_RETURN_IF_ERROR(ConnectCancellationManagers(ctx->cancellation_manager(), + params.cancellation_manager, + &deregister_fn)); + { + auto cleanup = gtl::MakeCleanup(std::move(deregister_fn)); + TF_RETURN_IF_ERROR(dataset->MakeIterator(IteratorContext(std::move(params)), + "Iterator", &iterator)); + TF_RETURN_IF_ERROR( + VerifyTypesMatch(output_dtypes_, iterator->output_dtypes())); + TF_RETURN_IF_ERROR( + VerifyShapesCompatible(output_shapes_, iterator->output_shapes())); + std::swap(new_state->iterator, iterator); + } mutex_lock l(mu_); std::swap(iterator_state_, new_state); @@ -477,64 +483,91 @@ class ToSingleElementOp : public AsyncOpKernel { // The call to `iterator->GetNext()` may block and depend on an // inter-op thread pool thread, so we issue the call from the // owned thread pool. - background_worker_.Schedule([ctx, done]() { - DatasetBase* dataset; - OP_REQUIRES_OK_ASYNC( - ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done); - std::unique_ptr iterator; - IteratorContext::Params params(ctx); - std::unique_ptr function_handle_cache = - absl::make_unique(params.flr); - params.function_handle_cache = function_handle_cache.get(); - std::unique_ptr resource_mgr = - absl::make_unique(); - params.resource_mgr = resource_mgr.get(); - IteratorContext iter_ctx(std::move(params)); + background_worker_.Schedule(std::bind( + [ctx](std::function& done) { + DatasetBase* dataset; + OP_REQUIRES_OK_ASYNC( + ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done); - OP_REQUIRES_OK_ASYNC( - ctx, - dataset->MakeIterator(&iter_ctx, "SingleElementIterator", &iterator), - done); + IteratorContext::Params params(ctx); + FunctionHandleCache function_handle_cache(params.flr); + params.function_handle_cache = &function_handle_cache; + ResourceMgr resource_mgr; + params.resource_mgr = &resource_mgr; + CancellationManager cancellation_manager; + params.cancellation_manager = &cancellation_manager; + std::function deregister_fn; + OP_REQUIRES_OK_ASYNC(ctx, + ConnectCancellationManagers( + ctx->cancellation_manager(), + params.cancellation_manager, &deregister_fn), + done); - // NOTE(jsimsa): We must destroy the iterator before calling `done()`, to - // avoid destruction races. - IteratorBase* raw_iterator = iterator.release(); - auto cleanup = gtl::MakeCleanup([raw_iterator, done] { - delete raw_iterator; - done(); - }); - std::vector components; - components.reserve(dataset->output_dtypes().size()); - bool end_of_sequence = false; + // Update the `done` callback to deregister the cancellation callback. 
+ done = std::bind( + [](const std::function& done, + const std::function& deregister_fn) { + deregister_fn(); + done(); + }, + std::move(done), std::move(deregister_fn)); - Status s = - raw_iterator->GetNext(&iter_ctx, &components, &end_of_sequence); - if (!s.ok()) { - ctx->SetStatus(s); - return; - } - if (end_of_sequence) { - ctx->SetStatus(errors::InvalidArgument("Dataset was empty.")); - return; - } - for (int i = 0; i < components.size(); ++i) { - // TODO(mrry): Check that the shapes match the shape attrs. - ctx->set_output(i, components[i]); - } + IteratorContext iter_ctx(std::move(params)); + std::unique_ptr iterator; + OP_REQUIRES_OK_ASYNC( + ctx, + dataset->MakeIterator(&iter_ctx, "SingleElementIterator", + &iterator), + done); - components.clear(); - Status s2 = - raw_iterator->GetNext(&iter_ctx, &components, &end_of_sequence); - if (!s2.ok()) { - ctx->SetStatus(s2); - return; - } - if (!end_of_sequence) { - ctx->SetStatus( - errors::InvalidArgument("Dataset had more than one element.")); - return; - } - }); + // Update the `done` callback to destroy the iterator before calling + // the actual callback to avoid destruction races. + IteratorBase* raw_iterator = iterator.release(); + done = std::bind( + [raw_iterator](const std::function& done) { + delete raw_iterator; + done(); + }, + std::move(done)); + + std::vector components; + components.reserve(dataset->output_dtypes().size()); + bool end_of_sequence = false; + + Status s = + raw_iterator->GetNext(&iter_ctx, &components, &end_of_sequence); + if (!s.ok()) { + ctx->SetStatus(s); + done(); + return; + } + if (end_of_sequence) { + ctx->SetStatus(errors::InvalidArgument("Dataset was empty.")); + done(); + return; + } + for (int i = 0; i < components.size(); ++i) { + // TODO(mrry): Check that the shapes match the shape attrs. + ctx->set_output(i, components[i]); + } + + components.clear(); + s.Update( + raw_iterator->GetNext(&iter_ctx, &components, &end_of_sequence)); + if (!s.ok()) { + ctx->SetStatus(s); + done(); + return; + } + if (!end_of_sequence) { + ctx->SetStatus( + errors::InvalidArgument("Dataset had more than one element.")); + done(); + return; + } + done(); + }, + std::move(done))); } private: @@ -560,121 +593,149 @@ class ReduceDatasetOp : public AsyncOpKernel { // The call to `iterator->GetNext()` may block and depend on an // inter-op thread pool thread, so we issue the call from the // owned thread pool. 
- background_worker_.Schedule([this, ctx, done]() { - DatasetBase* dataset; - OP_REQUIRES_OK_ASYNC( - ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done); - OpInputList inputs; - OP_REQUIRES_OK_ASYNC(ctx, ctx->input_list("initial_state", &inputs), - done); - std::vector state(inputs.begin(), inputs.end()); + background_worker_.Schedule(std::bind( + [this, ctx](std::function& done) { + DatasetBase* dataset; + OP_REQUIRES_OK_ASYNC( + ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done); + OpInputList inputs; + OP_REQUIRES_OK_ASYNC(ctx, ctx->input_list("initial_state", &inputs), + done); + std::vector state(inputs.begin(), inputs.end()); - std::unique_ptr captured_func; - OP_REQUIRES_OK_ASYNC( - ctx, - CapturedFunction::Create(ctx, func_metadata_, "other_arguments", - &captured_func), - done); + std::unique_ptr captured_func; + OP_REQUIRES_OK_ASYNC( + ctx, + CapturedFunction::Create(ctx, func_metadata_, "other_arguments", + &captured_func), + done); - IteratorContext::Params params(ctx); - std::unique_ptr function_handle_cache = - absl::make_unique(params.flr); - params.function_handle_cache = function_handle_cache.get(); - std::unique_ptr resource_mgr = - absl::make_unique(); - params.resource_mgr = resource_mgr.get(); - IteratorContext iter_ctx(std::move(params)); - std::unique_ptr instantiated_captured_func; - OP_REQUIRES_OK_ASYNC( - ctx, - captured_func->Instantiate(&iter_ctx, &instantiated_captured_func), - done); + IteratorContext::Params params(ctx); + auto function_handle_cache = + absl::make_unique(params.flr); + params.function_handle_cache = function_handle_cache.get(); + ResourceMgr resource_mgr; + params.resource_mgr = &resource_mgr; + CancellationManager cancellation_manager; + params.cancellation_manager = &cancellation_manager; + std::function deregister_fn; + OP_REQUIRES_OK_ASYNC(ctx, + ConnectCancellationManagers( + ctx->cancellation_manager(), + params.cancellation_manager, &deregister_fn), + done); - std::unique_ptr iterator; - OP_REQUIRES_OK_ASYNC( - ctx, dataset->MakeIterator(&iter_ctx, "ReduceIterator", &iterator), - done); + // Update the `done` callback to deregister the cancellation callback. + done = std::bind( + [](const std::function& done, + const std::function& deregister_fn) { + deregister_fn(); + done(); + }, + std::move(done), std::move(deregister_fn)); - // NOTE(jsimsa): We must destroy the iterator before calling `done()`, to - // avoid destruction races. - IteratorBase* raw_iterator = iterator.release(); - auto cleanup = gtl::MakeCleanup([raw_iterator, done] { - delete raw_iterator; - done(); - }); - auto done = []() {}; + IteratorContext iter_ctx(std::move(params)); + std::unique_ptr + instantiated_captured_func; + OP_REQUIRES_OK_ASYNC(ctx, + captured_func->Instantiate( + &iter_ctx, &instantiated_captured_func), + done); - // Iterate through the input dataset. - Status status; - while (true) { - OP_REQUIRES_ASYNC(ctx, !ctx->cancellation_manager()->IsCancelled(), - errors::Cancelled("Operation was cancelled"), done); - std::vector next_input_element; - bool end_of_input; - status = raw_iterator->GetNext(&iter_ctx, &next_input_element, - &end_of_input); - if (!status.ok() || end_of_input) { - break; - } + std::unique_ptr iterator; + OP_REQUIRES_OK_ASYNC( + ctx, + dataset->MakeIterator(&iter_ctx, "ReduceIterator", &iterator), + done); - // Run the reduce function to update the current state. 
- std::vector args; - args.reserve(state.size() + next_input_element.size()); - std::copy(state.begin(), state.end(), std::back_inserter(args)); - std::copy(next_input_element.begin(), next_input_element.end(), - std::back_inserter(args)); + // Update the `done` callback to destroy the iterator before calling + // the actual callback to avoid destruction races. + IteratorBase* raw_iterator = iterator.release(); + done = std::bind( + [raw_iterator](const std::function& done) { + delete raw_iterator; + done(); + }, + std::move(done)); - std::vector reduce_func_output; - status = instantiated_captured_func->Run(&iter_ctx, std::move(args), - &reduce_func_output); - if (!status.ok()) { - break; - } - OP_REQUIRES_ASYNC( - ctx, reduce_func_output.size() == state.size(), - errors::InvalidArgument( - "The number of components of the initial state and the reduce " - "function output does not match. (initial_state=", - state.size(), ", output=", reduce_func_output.size(), ")."), - done); - std::swap(reduce_func_output, state); - } + // Iterate through the input dataset. + Status status; + while (true) { + OP_REQUIRES_ASYNC(ctx, !ctx->cancellation_manager()->IsCancelled(), + errors::Cancelled("Operation was cancelled"), + done); + std::vector next_input_element; + bool end_of_input; + status = raw_iterator->GetNext(&iter_ctx, &next_input_element, + &end_of_input); + if (!status.ok() || end_of_input) { + break; + } - if (!status.ok()) { - ctx->SetStatus(status); - return; - } + // Run the reduce function to update the current state. + std::vector args; + args.reserve(state.size() + next_input_element.size()); + std::copy(state.begin(), state.end(), std::back_inserter(args)); + std::copy(next_input_element.begin(), next_input_element.end(), + std::back_inserter(args)); - OP_REQUIRES_ASYNC(ctx, state.size() == output_types_.size(), - errors::InvalidArgument( - "The number of result elements does not match " - "the size of output types: ", - state.size(), " vs. ", output_types_.size()), - done); - OP_REQUIRES_ASYNC(ctx, state.size() == output_shapes_.size(), - errors::InvalidArgument( - "The number of result elements does not match " - "the size of output shapes: ", - state.size(), " vs. ", output_shapes_.size()), - done); - for (int i = 0; i < state.size(); ++i) { - OP_REQUIRES_ASYNC( - ctx, state[i].dtype() == output_types_[i], - errors::InvalidArgument( - "The result does not match the expected type for component ", i, - ". Expected: ", DataTypeString(output_types_[i]), - ". Actual: ", DataTypeString(state[i].dtype()), "."), - done); - OP_REQUIRES_ASYNC( - ctx, output_shapes_[i].IsCompatibleWith(state[i].shape()), - errors::InvalidArgument( - "The result does not match the expected shape for component ", - i, ". Expected: ", output_shapes_[i].DebugString(), - ". Actual: ", state[i].shape().DebugString(), "."), - done); - ctx->set_output(i, state[i]); - } - }); + std::vector reduce_func_output; + status = instantiated_captured_func->Run(&iter_ctx, std::move(args), + &reduce_func_output); + if (!status.ok()) { + break; + } + OP_REQUIRES_ASYNC( + ctx, reduce_func_output.size() == state.size(), + errors::InvalidArgument( + "The number of components of the initial state and the " + "reduce " + "function output does not match. 
(initial_state=", + state.size(), ", output=", reduce_func_output.size(), ")."), + done); + std::swap(reduce_func_output, state); + } + + if (!status.ok()) { + ctx->SetStatus(status); + done(); + return; + } + + OP_REQUIRES_ASYNC(ctx, state.size() == output_types_.size(), + errors::InvalidArgument( + "The number of result elements does not match " + "the size of output types: ", + state.size(), " vs. ", output_types_.size()), + done); + OP_REQUIRES_ASYNC(ctx, state.size() == output_shapes_.size(), + errors::InvalidArgument( + "The number of result elements does not match " + "the size of output shapes: ", + state.size(), " vs. ", output_shapes_.size()), + done); + for (int i = 0; i < state.size(); ++i) { + OP_REQUIRES_ASYNC( + ctx, state[i].dtype() == output_types_[i], + errors::InvalidArgument( + "The result does not match the expected type for " + "component ", + i, ". Expected: ", DataTypeString(output_types_[i]), + ". Actual: ", DataTypeString(state[i].dtype()), "."), + done); + OP_REQUIRES_ASYNC( + ctx, output_shapes_[i].IsCompatibleWith(state[i].shape()), + errors::InvalidArgument( + "The result does not match the expected shape for " + "component ", + i, ". Expected: ", output_shapes_[i].DebugString(), + ". Actual: ", state[i].shape().DebugString(), "."), + done); + ctx->set_output(i, state[i]); + } + done(); + }, + std::move(done))); } private: @@ -882,8 +943,7 @@ void IteratorGetNextOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) { std::vector components; bool end_of_sequence = false; - Status s = iterator->GetNext(IteratorContext(ctx), &components, - &end_of_sequence); + Status s = iterator->GetNext(ctx, &components, &end_of_sequence); // NOTE(mrry): We must unref the iterator before calling `done()`, to // avoid destruction races. iterator->Unref(); @@ -910,8 +970,7 @@ void IteratorGetNextSyncOp::Compute(OpKernelContext* ctx) { std::vector components; bool end_of_sequence = false; - OP_REQUIRES_OK(ctx, iterator->GetNext(IteratorContext(ctx), &components, - &end_of_sequence)); + OP_REQUIRES_OK(ctx, iterator->GetNext(ctx, &components, &end_of_sequence)); OP_REQUIRES(ctx, !end_of_sequence, errors::OutOfRange("End of sequence")); for (int i = 0; i < components.size(); ++i) { @@ -933,8 +992,7 @@ void IteratorGetNextAsOptionalOp::ComputeAsync(OpKernelContext* ctx, std::vector components; bool end_of_sequence = false; - Status s = iterator->GetNext(IteratorContext(ctx), &components, - &end_of_sequence); + Status s = iterator->GetNext(ctx, &components, &end_of_sequence); // NOTE(mrry): We must unref the iterator before calling `done()`, to // avoid destruction races. 
iterator->Unref(); diff --git a/tensorflow/core/kernels/data/iterator_ops.h b/tensorflow/core/kernels/data/iterator_ops.h index ceeed061f57..09c951f72b8 100644 --- a/tensorflow/core/kernels/data/iterator_ops.h +++ b/tensorflow/core/kernels/data/iterator_ops.h @@ -40,14 +40,11 @@ class IteratorResource : public ResourceBase { : unbounded_thread_pool_(env, "tf_data_iterator_resource"), device_mgr_(std::move(device_mgr)), iterator_state_(std::make_shared( - std::move(flib_def), std::move(pflr), flr, nullptr /* iterator */)), + std::move(flib_def), std::move(pflr), flr, /*iterator=*/nullptr)), output_dtypes_(output_dtypes), output_shapes_(output_shapes) {} - Status GetNext(IteratorContext* ctx, std::vector* out_tensors, - bool* end_of_sequence); - - Status GetNext(IteratorContext&& ctx, std::vector* out_tensors, + Status GetNext(OpKernelContext* ctx, std::vector* out_tensors, bool* end_of_sequence); Status Save(SerializationContext* ctx, IteratorStateWriter* writer); @@ -75,22 +72,12 @@ class IteratorResource : public ResourceBase { function_handle_cache(absl::make_unique(flr)), iterator(std::move(iterator)) {} - State(std::shared_ptr flib_def, - std::shared_ptr pflr, - FunctionLibraryRuntime* flr, - std::unique_ptr function_handle_cache, - std::unique_ptr iterator) - : flib_def(flib_def), - flr(flr), - pflr(pflr), - function_handle_cache(std::move(function_handle_cache)), - iterator(std::move(iterator)) {} - std::shared_ptr flib_def; FunctionLibraryRuntime* flr = nullptr; // not owned. std::shared_ptr pflr; std::unique_ptr function_handle_cache; ResourceMgr resource_mgr; + CancellationManager cancellation_manager; std::unique_ptr iterator; }; diff --git a/tensorflow/core/kernels/data/multi_device_iterator_ops.cc b/tensorflow/core/kernels/data/multi_device_iterator_ops.cc index 0305a85153e..99d6304255e 100644 --- a/tensorflow/core/kernels/data/multi_device_iterator_ops.cc +++ b/tensorflow/core/kernels/data/multi_device_iterator_ops.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/input_colocation_exemption_registry.h" #include "tensorflow/core/common_runtime/process_function_library_runtime.h" +#include "tensorflow/core/framework/cancellation.h" #include "tensorflow/core/framework/dataset.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/function_handle_cache.h" @@ -27,6 +28,7 @@ limitations under the License. 
#include "tensorflow/core/kernels/ops_util.h" #include "tensorflow/core/lib/core/refcount.h" #include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/random/random.h" #include "tensorflow/core/util/device_name_utils.h" @@ -93,16 +95,40 @@ class MultiDeviceIterator : public ResourceBase { } void GetNextFromShard(OpKernelContext* ctx, int shard_num, - int64 incarnation_id, - MultiDeviceIteratorCallback callback) { + int64 incarnation_id, std::function done) { tf_shared_lock l(mu_); IteratorContext::Params params(ctx); params.flr = flr_; params.function_handle_cache = function_handle_cache_.get(); params.resource_mgr = &resource_mgr_; params.thread_factory = unbounded_thread_pool_.get_thread_factory(); - + params.cancellation_manager = &cancellation_manager_; + std::function deregister_fn; + OP_REQUIRES_OK_ASYNC(ctx, + ConnectCancellationManagers( + ctx->cancellation_manager(), + params.cancellation_manager, &deregister_fn), + done); IteratorContext iter_ctx(std::move(params)); + MultiDeviceIteratorCallback callback = std::bind( + [ctx](const HostBufferElement& elem, const std::function& done, + const std::function& deregister_fn) { + // iterator->Unref(); + Status s = elem.status; + if (!s.ok()) { + ctx->SetStatus(s); + } else if (elem.end_of_sequence) { + ctx->SetStatus(errors::OutOfRange("End of sequence")); + } else { + for (int i = 0; i < elem.value.size(); ++i) { + ctx->set_output(i, elem.value[i]); + } + } + deregister_fn(); + done(); + }, + std::placeholders::_1, std::move(done), std::move(deregister_fn)); + multi_device_buffer_->GetNextFromShard(&iter_ctx, shard_num, incarnation_id, std::move(callback)); } @@ -124,6 +150,8 @@ class MultiDeviceIterator : public ResourceBase { ResourceMgr* resource_mgr() { return &resource_mgr_; } + CancellationManager* cancellation_manager() { return &cancellation_manager_; } + private: // A private class that uses a background thread to keep a per device buffer // full. 
@@ -356,6 +384,7 @@ class MultiDeviceIterator : public ResourceBase { const std::unique_ptr pflr_; const std::unique_ptr function_handle_cache_; ResourceMgr resource_mgr_; + CancellationManager cancellation_manager_; std::shared_ptr lib_def_ GUARDED_BY(mu_); int64 incarnation_id_ GUARDED_BY(mu_) = 0; @@ -544,6 +573,13 @@ class MultiDeviceIteratorInitOp : public OpKernel { params.flr = resource->flr(); params.function_handle_cache = resource->function_handle_cache(); params.resource_mgr = resource->resource_mgr(); + params.cancellation_manager = resource->cancellation_manager(); + std::function deregister_fn; + OP_REQUIRES_OK(ctx, ConnectCancellationManagers(ctx->cancellation_manager(), + params.cancellation_manager, + &deregister_fn)); + auto cleanup = gtl::MakeCleanup(std::move(deregister_fn)); + IteratorContext iter_ctx(std::move(params)); OP_REQUIRES_OK( ctx, dataset->MakeIterator(std::move(iter_ctx), "Iterator", &iterator)); @@ -581,24 +617,7 @@ class MultiDeviceIteratorGetNextFromShardOp : public AsyncOpKernel { OP_REQUIRES_OK_ASYNC( ctx, LookupResource(ctx, HandleFromInput(ctx, 0), &iterator), done); - MultiDeviceIteratorCallback callback = std::bind( - [ctx](const HostBufferElement& elem, DoneCallback done) { - // iterator->Unref(); - Status s = elem.status; - if (!s.ok()) { - ctx->SetStatus(s); - } else if (elem.end_of_sequence) { - ctx->SetStatus(errors::OutOfRange("End of sequence")); - } else { - for (int i = 0; i < elem.value.size(); ++i) { - ctx->set_output(i, elem.value[i]); - } - } - done(); - }, - std::placeholders::_1, std::move(done)); - - iterator->GetNextFromShard(ctx, shard_num, incarnation_id, callback); + iterator->GetNextFromShard(ctx, shard_num, incarnation_id, std::move(done)); } }; diff --git a/tensorflow/core/kernels/data/prefetch_dataset_op.cc b/tensorflow/core/kernels/data/prefetch_dataset_op.cc index 76a4e39650e..ec6cec063d1 100644 --- a/tensorflow/core/kernels/data/prefetch_dataset_op.cc +++ b/tensorflow/core/kernels/data/prefetch_dataset_op.cc @@ -330,8 +330,7 @@ class PrefetchDatasetOp::Dataset : public DatasetBase { return Status::OK(); } - // Prefetches elements of the input, storing results in an internal - // buffer. + // Prefetches elements of the input, storing results in an internal buffer. // // It owns the iterator context passed to it. void PrefetchThread(const std::shared_ptr& ctx) { diff --git a/tensorflow/python/data/kernel_tests/map_test.py b/tensorflow/python/data/kernel_tests/map_test.py index d85caa96beb..98f1e6afb4d 100644 --- a/tensorflow/python/data/kernel_tests/map_test.py +++ b/tensorflow/python/data/kernel_tests/map_test.py @@ -19,6 +19,7 @@ from __future__ import print_function from collections import namedtuple import threading +import time import warnings from absl.testing import parameterized @@ -1094,6 +1095,30 @@ class MapTest(test_base.DatasetTestBase, parameterized.TestCase): dataset = dataset_ops.Dataset.from_tensors(constant_op.constant(1.0)) dataset.map(func) + @parameterized.named_parameters( + ("Sequential", None), + ("Parallel", 12), + ) + @test_util.run_v1_only("graph-mode specific test") + def testSkipEagerMapCancellation(self, num_parallel_calls): + # Checks that a cancellation of is threaded through to map transformation. 
+ queue = data_flow_ops.FIFOQueue(10, dtypes.int32, ()) + + def fn(_): + return queue.dequeue() + + dataset = dataset_ops.Dataset.range(1).map( + fn, num_parallel_calls=num_parallel_calls) + get_next = self.getNext(dataset, requires_initialization=True) + + with self.cached_session() as sess: + thread = self.checkedThread(self.assert_op_cancelled, args=(get_next(),)) + thread.start() + time.sleep(0.2) + sess.close() + thread.join() + + # TODO(shivaniagarwal): separate out `map` and `map_with_legacy_function` tests # as later would not work in v2. @test_util.run_all_in_graph_and_eager_modes diff --git a/tensorflow/python/data/kernel_tests/test_base.py b/tensorflow/python/data/kernel_tests/test_base.py index c831b135aac..f17f0180679 100644 --- a/tensorflow/python/data/kernel_tests/test_base.py +++ b/tensorflow/python/data/kernel_tests/test_base.py @@ -44,7 +44,7 @@ class DatasetTestBase(test.TestCase): dataset_ops.Dataset = dataset_ops.DatasetV1 def assert_op_cancelled(self, op): - with self.assertRaisesRegexp(errors.CancelledError, "was cancelled"): + with self.assertRaises(errors.CancelledError): self.evaluate(op) def assertValuesEqual(self, expected, actual): From 5c2c228604135c72db4bf5c84ef7ca76bd77d5a6 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Wed, 24 Jul 2019 12:47:16 -0700 Subject: [PATCH 0490/3053] Simplify graphdef2mlir/multiple-use-next-iteration.pbtxt test to be more targeted This test checks that a NextIteration node feeding two different merge nodes is properly imported, the CHECK lines are updated to focus on this. PiperOrigin-RevId: 259796268 --- .../multiple-use-next-iteration.pbtxt | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/multiple-use-next-iteration.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/multiple-use-next-iteration.pbtxt index 6baa4973407..b8d7cfeddf2 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/multiple-use-next-iteration.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/multiple-use-next-iteration.pbtxt @@ -1,5 +1,13 @@ # RUN: tf-mlir-translate -graphdef-to-mlir %s -o - | FileCheck %s +# Verify that a NextIteration node feeding two different merge nodes is properly +# Imported. 
+ +# CHECK-LABEL: func @main() +# CHECK: %[[NEXTITERATION:[0-9]+]]:2 = "_tf.NextIteration.source" +# CHECK: "_tf.Merge"({{.*}}, %[[NEXTITERATION]]#0) +# CHECK: "_tf.Merge"({{.*}}, %[[NEXTITERATION]]#0) + node { name: "Const" op: "Const" @@ -137,14 +145,3 @@ versions { producer: 62 } -# CHECK: func @main() { -# CHECK-NEXT: %0:2 = "_tf.NextIteration.source"() {T = "tfdtype$DT_INT32", device = "", name = "NextIteration"} : () -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "Add/y", value = dense<1> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %2:2 = "_tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "Const", value = dense<0> : tensor} : () -> (tensor, !_tf.control) -# CHECK-NEXT: %3:2 = "_tf.Enter"(%2#0) {T = "tfdtype$DT_INT32", device = "", frame_name = "while_context", is_constant = false, name = "Enter", parallel_iterations = 10 : i64} : (tensor) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %4:3 = "_tf.Merge"(%3#0, %0#0) {N = 2 : i64, T = "tfdtype$DT_INT32", device = "", name = "Merge"} : (tensor<*xi32>, tensor<*xi32>) -> (tensor<*xi32>, tensor, !_tf.control) -# CHECK-NEXT: %5:2 = "_tf.Add"(%4#0, %1#0) {T = "tfdtype$DT_INT32", device = "", name = "Add"} : (tensor<*xi32>, tensor) -> (tensor<*xi32>, !_tf.control) -# CHECK-NEXT: %6 = "_tf.NextIteration.sink"(%5#0) {T = "tfdtype$DT_INT32", device = "", name = "NextIteration"} : (tensor<*xi32>) -> !_tf.control -# CHECK-NEXT: %7:3 = "_tf.Merge"(%3#0, %0#0) {N = 2 : i64, T = "tfdtype$DT_INT32", device = "", name = "Use_NextIteration_Again"} : (tensor<*xi32>, tensor<*xi32>) -> (tensor<*xi32>, tensor, !_tf.control) -# CHECK-NEXT: return -# CHECK-NEXT: } From 6125279e98bad5e1769fd7b6e1a39dd7cf2d49c1 Mon Sep 17 00:00:00 2001 From: Mehrdad Khatir Date: Wed, 24 Jul 2019 12:54:13 -0700 Subject: [PATCH 0491/3053] Added a new op: ragged.reduce_join PiperOrigin-RevId: 259797710 --- tensorflow/python/ops/ragged/BUILD | 20 +++ .../python/ops/ragged/ragged_dispatch.py | 9 +- .../python/ops/ragged/ragged_dispatch_test.py | 76 ++++++++++++ .../python/ops/ragged/ragged_math_ops.py | 71 ++++++----- .../python/ops/ragged/ragged_string_ops.py | 8 ++ .../ops/ragged/ragged_string_ops_test.py | 114 ++++++++++++++++++ tensorflow/python/ops/string_ops.py | 42 ++++--- 7 files changed, 292 insertions(+), 48 deletions(-) create mode 100644 tensorflow/python/ops/ragged/ragged_string_ops_test.py diff --git a/tensorflow/python/ops/ragged/BUILD b/tensorflow/python/ops/ragged/BUILD index fbed2169677..2e0b6884b64 100644 --- a/tensorflow/python/ops/ragged/BUILD +++ b/tensorflow/python/ops/ragged/BUILD @@ -232,6 +232,7 @@ py_library( srcs_version = "PY2AND3", deps = [ ":ragged_array_ops", + ":ragged_math_ops", ":ragged_tensor", "//tensorflow/python:array_ops", "//tensorflow/python:dtypes", @@ -740,6 +741,25 @@ py_test( ], ) +py_test( + name = "ragged_string_ops_test", + srcs = ["ragged_string_ops_test.py"], + python_version = "PY3", + srcs_version = "PY2AND3", + deps = [ + ":ragged", + ":ragged_factory_ops", + ":ragged_string_ops", + ":ragged_tensor", + "//tensorflow/python:array_ops", + "//tensorflow/python:dtypes", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:platform_test", + "//tensorflow/python:tensor_shape", + "@absl_py//absl/testing:parameterized", + ], +) + py_test( name = "ragged_constant_value_op_test", srcs = ["ragged_constant_value_op_test.py"], diff --git a/tensorflow/python/ops/ragged/ragged_dispatch.py 
b/tensorflow/python/ops/ragged/ragged_dispatch.py index 50d9079a287..0f67c8c6edc 100644 --- a/tensorflow/python/ops/ragged/ragged_dispatch.py +++ b/tensorflow/python/ops/ragged/ragged_dispatch.py @@ -37,6 +37,7 @@ from tensorflow.python.ops.ragged import ragged_concat_ops from tensorflow.python.ops.ragged import ragged_gather_ops from tensorflow.python.ops.ragged import ragged_math_ops from tensorflow.python.ops.ragged import ragged_squeeze_op +from tensorflow.python.ops.ragged import ragged_string_ops from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.ops.ragged import ragged_tensor_shape from tensorflow.python.ops.ragged import ragged_util @@ -388,7 +389,7 @@ _BINARY_ELEMENTWISE_OPS = [ # We don't need to register a separate delegation handler for these v1 ops, # since they delegate to the v2 ops (which already have a handler). But we # still want to include them in the ragged_op_list() output. -_V1_OPS_THAT_DELEGATE_TO_V2_OPS = [ +_V2_OPS_THAT_ARE_DELEGATED_TO_FROM_V1_OPS = [ math_ops.reduce_sum, math_ops.reduce_prod, math_ops.reduce_min, @@ -396,6 +397,9 @@ _V1_OPS_THAT_DELEGATE_TO_V2_OPS = [ math_ops.reduce_mean, math_ops.reduce_any, math_ops.reduce_all, + string_ops.string_to_number, + string_ops.string_to_hash_bucket, + string_ops.reduce_join_v2, ] @@ -465,6 +469,7 @@ _RAGGED_DISPATCH_OPS = [ ['data', 'segment_ids']), (math_ops.unsorted_segment_sqrt_n, ragged_math_ops.segment_sqrt_n, ['data', 'segment_ids']), + (string_ops.reduce_join_v2, ragged_string_ops.reduce_join, ['inputs']), (math_ops.reduce_sum, ragged_math_ops.reduce_sum, ['input_tensor']), (math_ops.reduce_prod, ragged_math_ops.reduce_prod, ['input_tensor']), (math_ops.reduce_min, ragged_math_ops.reduce_min, ['input_tensor']), @@ -527,7 +532,7 @@ def _ragged_op_signature(op, ragged_args): def _op_is_in_tf_version(op, version): if version == 1: return (tf_export.get_v1_names(tf_decorator.unwrap(op)[1]) or - op in _V1_OPS_THAT_DELEGATE_TO_V2_OPS) + op in _V2_OPS_THAT_ARE_DELEGATED_TO_FROM_V1_OPS) elif version == 2: return tf_export.get_v2_names(tf_decorator.unwrap(op)[1]) else: diff --git a/tensorflow/python/ops/ragged/ragged_dispatch_test.py b/tensorflow/python/ops/ragged/ragged_dispatch_test.py index 2c54cbce917..246a0255c72 100644 --- a/tensorflow/python/ops/ragged/ragged_dispatch_test.py +++ b/tensorflow/python/ops/ragged/ragged_dispatch_test.py @@ -33,6 +33,7 @@ from tensorflow.python.ops import gen_bitwise_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import parsing_ops from tensorflow.python.ops import string_ops +from tensorflow.python.ops.ragged import ragged_dispatch from tensorflow.python.ops.ragged import ragged_factory_ops from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.platform import googletest @@ -672,6 +673,25 @@ class RaggedElementwiseOpsTest(test_util.TensorFlowTestCase, 1 }, expected=[True, True]), + dict( + op=string_ops.reduce_join, + kwargs={ + 'inputs': + ragged_factory_ops.constant_value( + [[b'this', b'is', b'a', b'test', b'for', b'ragged', + b'tensors'], + [b'please', b'do', b'not', b'panic', b'!']]), + 'axis': + 0, + 'keepdims': + False, + 'separator': + '' + }, + expected=[ + b'thisplease', b'isdo', b'anot', b'testpanic', b'for!', b'ragged', + b'tensors' + ]), dict( op=math_ops.reduce_all, kwargs={ @@ -714,6 +734,62 @@ class RaggedElementwiseOpsTest(test_util.TensorFlowTestCase, result = op(*args, **kwargs) self.assertAllEqual(result, expected) + def test_ragged_op_list(self): + # Ops that should be listed as 
supported in both v1 and v2. + supported_ops = [ + 'bitwise.bitwise_and', 'bitwise.bitwise_or', 'bitwise.bitwise_xor', + 'bitwise.invert', 'bitwise.left_shift', 'bitwise.right_shift', + 'clip_by_value', 'concat', 'debugging.check_numerics', 'dtypes.cast', + 'dtypes.complex', 'dtypes.saturate_cast', 'expand_dims', 'gather_nd', + 'gather', 'identity', 'io.decode_base64', 'io.decode_compressed', + 'io.encode_base64', 'math.abs', 'math.acos', 'math.acosh', 'math.add_n', + 'math.add', 'math.angle', 'math.asin', 'math.asinh', 'math.atan2', + 'math.atan', 'math.atanh', 'math.ceil', 'math.conj', 'math.cos', + 'math.cosh', 'math.digamma', 'math.divide_no_nan', 'math.divide', + 'math.equal', 'math.erf', 'math.erfc', 'math.exp', 'math.expm1', + 'math.floor', 'math.floordiv', 'math.floormod', 'math.greater_equal', + 'math.greater', 'math.imag', 'math.is_finite', 'math.is_inf', + 'math.is_nan', 'math.less_equal', 'math.less', 'math.lgamma', + 'math.log1p', 'math.log_sigmoid', 'math.log', 'math.logical_and', + 'math.logical_not', 'math.logical_or', 'math.logical_xor', + 'math.maximum', 'math.minimum', 'math.multiply', 'math.negative', + 'math.not_equal', 'math.pow', 'math.real', 'math.reciprocal', + 'math.reduce_any', 'math.reduce_max', 'math.reduce_mean', + 'math.reduce_min', 'math.reduce_prod', 'math.reduce_sum', 'math.rint', + 'math.round', 'math.rsqrt', 'math.sign', 'math.sin', 'math.sinh', + 'math.sqrt', 'math.square', 'math.squared_difference', 'math.subtract', + 'math.tan', 'math.truediv', 'math.unsorted_segment_max', + 'math.unsorted_segment_mean', 'math.unsorted_segment_min', + 'math.unsorted_segment_prod', 'math.unsorted_segment_sqrt_n', + 'math.unsorted_segment_sum', 'ones_like', 'rank', 'realdiv', + 'reduce_all', 'size', 'squeeze', 'stack', 'strings.as_string', + 'strings.join', 'strings.length', 'strings.reduce_join', + 'strings.regex_full_match', 'strings.regex_replace', 'strings.strip', + 'strings.substr', 'strings.to_hash_bucket_fast', + 'strings.to_hash_bucket_strong', 'strings.to_hash_bucket', + 'strings.to_number', 'strings.unicode_script', 'tile', 'truncatediv', + 'truncatemod', 'zeros_like' + ] + + # Ops that should be listed as supported in v1 only. + # TODO(edloper): Add a dispatch for where_v2. + supported_ops_v1 = ['batch_gather', 'where'] + + # Ops that should be listed as supported in v2 only. + supported_ops_v2 = [] + + v1_ragged_ops = ragged_dispatch.ragged_op_list(tf_version=1) + for element in supported_ops + supported_ops_v1: + self.assertIn(element, v1_ragged_ops) + for element in supported_ops_v2: + self.assertNotIn(element, v1_ragged_ops) + + v2_ragged_ops = ragged_dispatch.ragged_op_list(tf_version=2) + for element in supported_ops + supported_ops_v2: + self.assertIn(element, v2_ragged_ops) + for element in supported_ops_v1: + self.assertNotIn(element, v2_ragged_ops) + if __name__ == '__main__': googletest.main() diff --git a/tensorflow/python/ops/ragged/ragged_math_ops.py b/tensorflow/python/ops/ragged/ragged_math_ops.py index 7e27cd29377..39bd93e527f 100644 --- a/tensorflow/python/ops/ragged/ragged_math_ops.py +++ b/tensorflow/python/ops/ragged/ragged_math_ops.py @@ -159,6 +159,7 @@ def _ragged_segment_aggregate(unsorted_segment_op, data, segment_ids, num_segments, + separator='', name=None): """Aggregates along segments of a RaggedTensor using `unsorted_segment_op`. @@ -181,6 +182,8 @@ def _ragged_segment_aggregate(unsorted_segment_op, `int32`. `segment_ids.shape` must be a prefix of `data.shape`. `segment_ids` is not required to be sorted. 
num_segments: An `int32` or `int64` scalar. + separator: An optional string. Defaults to "". The separator to + use when joining. Only used for string types. name: A name prefix for the returned tensor (optional). Returns: @@ -192,7 +195,12 @@ def _ragged_segment_aggregate(unsorted_segment_op, """ if not (ragged_tensor.is_ragged(data) or ragged_tensor.is_ragged(segment_ids)): - return unsorted_segment_op(data, segment_ids, num_segments, name) + if data.dtype == dtypes.string: + # It uses unsorted_segment_join. + return unsorted_segment_op(data, segment_ids, num_segments, separator, + name) + else: + return unsorted_segment_op(data, segment_ids, num_segments, name) with ops.name_scope(name, 'RaggedSegment', [data, segment_ids, num_segments]) as name: @@ -213,7 +221,8 @@ def _ragged_segment_aggregate(unsorted_segment_op, message='segment_ids.shape must be a prefix of data.shape') with ops.control_dependencies([check_splits]): return _ragged_segment_aggregate(unsorted_segment_op, data.values, - segment_ids.values, num_segments, name) + segment_ids.values, num_segments, + separator) # Find the length of each row in data. (shape=[data_nrows]) data_row_lengths = data.row_splits[1:] - data.row_splits[:-1] @@ -407,12 +416,13 @@ _RAGGED_REDUCE_ANY_EXAMPLE = """ """ -def _ragged_reduce_aggregate(reduce_op, - unsorted_segment_op, - rt_input, - axis, - keepdims, - name=None): +def ragged_reduce_aggregate(reduce_op, + unsorted_segment_op, + rt_input, + axis, + keepdims, + separator='', + name=None): """Aggregates across axes of a RaggedTensor using the given `Tensor` ops. Reduces `rt_input` along the dimensions given in `axis`. The rank of the @@ -437,6 +447,8 @@ def _ragged_reduce_aggregate(reduce_op, given set of axes), or a `Tensor` with a constant value. Must be in the range `[0, rt_input.rank)`. keepdims: If true, retains reduced dimensions with length 1. + separator: An optional string. Defaults to ''. The separator to use when + joining. Used only when input type is string. name: A name prefix for the returned tensor (optional). Returns: @@ -484,10 +496,12 @@ def _ragged_reduce_aggregate(reduce_op, # does not work for reduce_mean.) However, reducing multiple axes at # once will probably require a nontrivial c++ op. 
axis = sorted(axis) - inner_reduced = _ragged_reduce_aggregate(reduce_op, unsorted_segment_op, - rt_input, axis[-1], keepdims) - return _ragged_reduce_aggregate(reduce_op, unsorted_segment_op, - inner_reduced, axis[:-1], keepdims) + inner_reduced = ragged_reduce_aggregate(reduce_op, unsorted_segment_op, + rt_input, axis[-1], keepdims, + separator) + return ragged_reduce_aggregate(reduce_op, unsorted_segment_op, + inner_reduced, axis[:-1], keepdims, + separator) rt_input = ragged_tensor.convert_to_tensor_or_ragged_tensor( rt_input, name='rt_input') @@ -500,48 +514,49 @@ def _ragged_reduce_aggregate(reduce_op, num_segments = math_ops.maximum(math_ops.reduce_max(row_lengths), 0) segment_ids = range(row_lengths).values return _ragged_segment_aggregate(unsorted_segment_op, rt_input.values, - segment_ids, num_segments) + segment_ids, num_segments, separator) elif axis == 1: # out[i_0, i_1, i_2, ..., i_N] = sum_{j} rt_input[i_0, j, i_2, ..., i_N] num_segments = array_ops.shape(rt_input.row_splits)[0] - 1 segment_ids = segment_id_ops.row_splits_to_segment_ids( rt_input.row_splits) return _ragged_segment_aggregate(unsorted_segment_op, rt_input.values, - segment_ids, num_segments) + segment_ids, num_segments, separator) else: # out[i_0, ..., i_[axis-1], i_axis+1], ..., i_N] = # sum_{j} rt_input [i_0, ..., i_[axis-1], j, i_axis+1], ..., i_N] return rt_input.with_values( - _ragged_reduce_aggregate(reduce_op, unsorted_segment_op, - rt_input.values, axis - 1, keepdims)) + ragged_reduce_aggregate(reduce_op, unsorted_segment_op, + rt_input.values, axis - 1, keepdims, + separator)) def reduce_sum(input_tensor, axis=None, keepdims=None, name=None): """For docs, see: _RAGGED_REDUCE_DOCSTRING.""" - return _ragged_reduce_aggregate(math_ops.reduce_sum, - math_ops.unsorted_segment_sum, input_tensor, - axis, keepdims, name or 'RaggedReduceSum') + return ragged_reduce_aggregate(math_ops.reduce_sum, + math_ops.unsorted_segment_sum, input_tensor, + axis, keepdims, name or 'RaggedReduceSum') def reduce_prod(input_tensor, axis=None, keepdims=None, name=None): """For docs, see: _RAGGED_REDUCE_DOCSTRING.""" - return _ragged_reduce_aggregate(math_ops.reduce_prod, - math_ops.unsorted_segment_prod, input_tensor, - axis, keepdims, name or 'RaggedReduceProd') + return ragged_reduce_aggregate(math_ops.reduce_prod, + math_ops.unsorted_segment_prod, input_tensor, + axis, keepdims, name or 'RaggedReduceProd') def reduce_min(input_tensor, axis=None, keepdims=None, name=None): """For docs, see: _RAGGED_REDUCE_DOCSTRING.""" - return _ragged_reduce_aggregate(math_ops.reduce_min, - math_ops.unsorted_segment_min, input_tensor, - axis, keepdims, name or 'RaggedReduceMin') + return ragged_reduce_aggregate(math_ops.reduce_min, + math_ops.unsorted_segment_min, input_tensor, + axis, keepdims, name or 'RaggedReduceMin') def reduce_max(input_tensor, axis=None, keepdims=None, name=None): """For docs, see: _RAGGED_REDUCE_DOCSTRING.""" - return _ragged_reduce_aggregate(math_ops.reduce_max, - math_ops.unsorted_segment_max, input_tensor, - axis, keepdims, name or 'RaggedReduceMax') + return ragged_reduce_aggregate(math_ops.reduce_max, + math_ops.unsorted_segment_max, input_tensor, + axis, keepdims, name or 'RaggedReduceMax') def reduce_mean(input_tensor, axis=None, keepdims=None, name=None): diff --git a/tensorflow/python/ops/ragged/ragged_string_ops.py b/tensorflow/python/ops/ragged/ragged_string_ops.py index 4b225da2edd..ed52e9a88fa 100644 --- a/tensorflow/python/ops/ragged/ragged_string_ops.py +++ 
b/tensorflow/python/ops/ragged/ragged_string_ops.py @@ -24,6 +24,7 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_string_ops from tensorflow.python.ops import string_ops from tensorflow.python.ops.ragged import ragged_array_ops +from tensorflow.python.ops.ragged import ragged_math_ops from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.util import deprecation from tensorflow.python.util.tf_export import tf_export @@ -642,3 +643,10 @@ def strings_split_v1(input=None, sep=None, maxsplit=-1, # pylint: disable=redef return ragged_result else: raise ValueError("result_type must be 'RaggedTensor' or 'SparseTensor'.") + + +def reduce_join(inputs, axis=None, keepdims=None, separator="", name=None): + """For docs, see: _RAGGED_REDUCE_DOCSTRING.""" + return ragged_math_ops.ragged_reduce_aggregate( + string_ops.reduce_join, string_ops.unsorted_segment_join, inputs, axis, + keepdims, separator, name or "RaggedSegmentJoin") diff --git a/tensorflow/python/ops/ragged/ragged_string_ops_test.py b/tensorflow/python/ops/ragged/ragged_string_ops_test.py new file mode 100644 index 00000000000..52f88053ed8 --- /dev/null +++ b/tensorflow/python/ops/ragged/ragged_string_ops_test.py @@ -0,0 +1,114 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for ragged_string_ops.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl.testing import parameterized +from tensorflow.python.framework import tensor_shape +from tensorflow.python.framework import test_util +from tensorflow.python.ops.ragged import ragged_factory_ops +from tensorflow.python.ops.ragged import ragged_string_ops +from tensorflow.python.platform import googletest + + +@test_util.run_all_in_graph_and_eager_modes +class RaggedStringOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase): + + def test_rank_one(self): + input_array = [b'this', b'is', b'a', b'test'] + truth = b'thisisatest' + truth_shape = [] + with self.cached_session(): + output = ragged_string_ops.reduce_join( + inputs=input_array, axis=-1, keepdims=False, separator='') + output_array = self.evaluate(output) + self.assertAllEqual(truth, output_array) + self.assertAllEqual(truth_shape, output.get_shape()) + + @parameterized.parameters([ + { + 'input_array': [[ + b'this', b'is', b'a', b'test', b'for', b'ragged', b'tensors' + ], [b'please', b'do', b'not', b'panic', b'!']], + 'axis': 0, + 'keepdims': False, + 'truth': [ + b'thisplease', b'isdo', b'anot', b'testpanic', b'for!', b'ragged', + b'tensors' + ], + 'truth_shape': [7], + }, + { + 'input_array': [[ + b'this', b'is', b'a', b'test', b'for', b'ragged', b'tensors' + ], [b'please', b'do', b'not', b'panic', b'!']], + 'axis': 1, + 'keepdims': False, + 'truth': [b'thisisatestforraggedtensors', b'pleasedonotpanic!'], + 'truth_shape': [2], + }, + { + 'input_array': [[[b't', b'h', b'i', b's'], [b'i', b's'], [b'a'], + [b't', b'e', b's', b't']], + [[b'p', b'l', b'e', b'a', b's', b'e'], + [b'p', b'a', b'n', b'i', b'c']]], + 'axis': -1, + 'keepdims': False, + 'truth': [[b'this', b'is', b'a', b'test'], [b'please', b'panic']], + 'truth_shape': [2, None], + 'separator': '', + }, + { + 'input_array': [[[[b't'], [b'h'], [b'i'], [b's']], [[b'i', b's']], + [[b'a', b'n']], [[b'e'], [b'r'], [b'r']]], + [[[b'p'], [b'l'], [b'e'], [b'a'], [b's'], [b'e']], + [[b'p'], [b'a'], [b'n'], [b'i'], [b'c']]]], + 'axis': -1, + 'keepdims': False, + 'truth': [[[b't', b'h', b'i', b's'], [b'is'], [b'an'], + [b'e', b'r', b'r']], + [[b'p', b'l', b'e', b'a', b's', b'e'], + [b'p', b'a', b'n', b'i', b'c']]], + 'truth_shape': [2, None, None], + 'separator': '', + }, + ]) + def test_different_ranks(self, + input_array, + axis, + keepdims, + truth, + truth_shape, + separator=''): + with self.cached_session(): + input_tensor = ragged_factory_ops.constant(input_array) + output = ragged_string_ops.reduce_join( + inputs=input_tensor, + axis=axis, + keepdims=keepdims, + separator=separator) + output_array = self.evaluate(output) + self.assertAllEqual(truth, output_array) + if all(isinstance(s, tensor_shape.Dimension) for s in output.shape): + output_shape = [dim.value for dim in output.shape] + else: + output_shape = output.shape + self.assertAllEqual(truth_shape, output_shape) + + +if __name__ == '__main__': + googletest.main() diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py index 507339b55bb..dced1400287 100644 --- a/tensorflow/python/ops/string_ops.py +++ b/tensorflow/python/ops/string_ops.py @@ -304,13 +304,8 @@ def string_split_v2(source, sep=None, maxsplit=-1): return sparse_tensor.SparseTensor(indices, values, shape) -def _reduce_join_reduction_dims(x, axis, reduction_indices): - """Returns 
range(rank(x) - 1, 0, -1) if reduction_indices is None.""" - # TODO(aselle): Remove this after deprecation - if reduction_indices is not None: - if axis is not None: - raise ValueError("Can't specify both 'axis' and 'reduction_indices'.") - axis = reduction_indices +def _reduce_join_reduction_dims(x, axis): + """Returns range(rank(x) - 1, 0, -1) if axis is None; or axis otherwise.""" if axis is not None: return axis else: @@ -324,6 +319,9 @@ def _reduce_join_reduction_dims(x, axis, reduction_indices): @tf_export(v1=["strings.reduce_join", "reduce_join"]) +@deprecation.deprecated_args(None, + "keep_dims is deprecated, use keepdims instead", + "keep_dims") @deprecation.deprecated_endpoints("reduce_join") def reduce_join(inputs, axis=None, # pylint: disable=missing-docstring keep_dims=None, @@ -331,30 +329,38 @@ def reduce_join(inputs, axis=None, # pylint: disable=missing-docstring name=None, reduction_indices=None, keepdims=None): - keep_dims = deprecation.deprecated_argument_lookup( - "keepdims", keepdims, "keep_dims", keep_dims) + keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims, + "keep_dims", keep_dims) if keep_dims is None: keep_dims = False - inputs_t = ops.convert_to_tensor(inputs) - reduction_indices = _reduce_join_reduction_dims( - inputs_t, axis, reduction_indices) - return gen_string_ops.reduce_join( - inputs=inputs_t, - reduction_indices=reduction_indices, - keep_dims=keep_dims, + axis = deprecation.deprecated_argument_lookup("axis", axis, + "reduction_indices", + reduction_indices) + return reduce_join_v2( + inputs=inputs, + axis=axis, + keepdims=keepdims, separator=separator, name=name) @tf_export("strings.reduce_join", v1=[]) +@dispatch.add_dispatch_support def reduce_join_v2( # pylint: disable=missing-docstring inputs, axis=None, keepdims=False, separator="", name=None): - return reduce_join( - inputs, axis, keep_dims=keepdims, separator=separator, name=name) + with ops.name_scope(None, "ReduceJoin", [inputs, axis]): + inputs_t = ops.convert_to_tensor(inputs) + axis = _reduce_join_reduction_dims(inputs_t, axis) + return gen_string_ops.reduce_join( + inputs=inputs_t, + reduction_indices=axis, + keep_dims=keepdims, + separator=separator, + name=name) reduce_join.__doc__ = deprecation.rewrite_argument_docstring( From 711d4fe8132c3cdd70c3230997189d1b87c695de Mon Sep 17 00:00:00 2001 From: Sundeep Gottipati <42554856+bananabowl@users.noreply.github.com> Date: Wed, 24 Jul 2019 13:15:25 -0700 Subject: [PATCH 0492/3053] Mention other default learning rate changes in 1.14 relnotes --- RELEASE.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/RELEASE.md b/RELEASE.md index cc0d3e6aaee..debbba723dd 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -43,7 +43,11 @@ * Transitive dependencies on :pooling_ops were removed. Some users may need to add explicit dependencies on :pooling_ops if they reference the operators from that library. 
-* tf.keras.optimizers.Adadelta default learning rate changed from 1.0 to .001 +* tf.keras.optimizers default learning rate changes: + * Adadelta: 1.000 to 0.001 + * Adagrad: 0.01 to 0.001 + * Adamax: 0.002 to 0.001 + * NAdam: 0.002 to 0.001 ## Bug Fixes and Other Changes From 59a1603c7d5ce46b344c8ead8b4cac905a4b03de Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Wed, 24 Jul 2019 12:57:16 -0700 Subject: [PATCH 0493/3053] Simplify graphdef2mlir/prune_unused_nodes.pbtxt test to be more targeted This test intends to check the pruning behavior, the CHECK lines are updated to CHECK the absence of the pruned node in the output instead of positively checking everything else. PiperOrigin-RevId: 259798258 --- .../tests/graphdef2mlir/prune_unused_nodes.pbtxt | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/prune_unused_nodes.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/prune_unused_nodes.pbtxt index f57a42ae287..7715a0eb9df 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/prune_unused_nodes.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/prune_unused_nodes.pbtxt @@ -1,5 +1,10 @@ # RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-prune-unused-nodes -tf-input-arrays=input0,input1,unused_input -tf-input-data-types=DT_INT32,DT_INT32,DT_INT32 -tf-input-shapes=10:10:10 -tf-output-arrays=Add -o - | FileCheck %s +# Verify that an unused Node (here named "Prune") isn't converted when we +# request pruning on import. +# CHECK-LABEL: func @main +# CHECK-NOT: Prune + node { name: "Prune" op: "Const" @@ -66,13 +71,3 @@ node { versions { producer: 27 } - -# CHECK: func @main(%arg0: tensor<10xi32>, %arg1: tensor<10xi32>, %arg2: tensor<10xi32>) -> tensor<10xi32> -# CHECK-NEXT: attributes {tf.entry_function = {inputs = "input0, input1, unused_input", outputs = "Add"}} { -# CHECK-NEXT: %0:2 = "_tf.Placeholder.input"(%arg0) {device = "", dtype = "tfdtype$DT_INT32", name = "input0", shape = "tfshape$dim { size: 10 }"} : (tensor<10xi32>) -> (tensor<10xi32>, !_tf.control) -# CHECK-NEXT: %1:2 = "_tf.Placeholder.input"(%arg1) {device = "", dtype = "tfdtype$DT_INT32", name = "input1", shape = "tfshape$dim { size: 10 }"} : (tensor<10xi32>) -> (tensor<10xi32>, !_tf.control) -# CHECK-NEXT: %2:2 = "_tf.Add"(%0#0, %1#0) {T = "tfdtype$DT_INT32", device = "", name = "Add"} : (tensor<10xi32>, tensor<10xi32>) -> (tensor<10xi32>, !_tf.control) -# CHECK-NEXT: %3:2 = "_tf.Placeholder.input"(%arg2) {device = "", dtype = "tfdtype$DT_INT32", name = "unused_input", shape = "tfshape$dim { size: 10 }"} : (tensor<10xi32>) -> (tensor<10xi32>, !_tf.control) -# CHECK-NEXT: return %2#0 : tensor<10xi32> -# CHECK-NEXT: } - From d2c9498b5ad41b64ef75c1142b74c8df7900346b Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Wed, 24 Jul 2019 13:03:50 -0700 Subject: [PATCH 0494/3053] Simplify graphdef2mlir/graph-library.pbtxt test to be more targeted This test verifies that functions from the library are properly imported, the CHECK lines are updated to target this in particular. 
PiperOrigin-RevId: 259799838 --- .../tests/graphdef2mlir/graph-library.pbtxt | 22 +++++++++---------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-library.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-library.pbtxt index 83ca4466869..760dffd36f1 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-library.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-library.pbtxt @@ -36,15 +36,13 @@ versions { min_consumer: 12 } -# CHECK: func @main() { -# CHECK-NEXT: %0 = "_tf.foo0"() {device = "", name = "unnamed"} : () -> !_tf.control -# CHECK-NEXT: %1 = "_tf.bar0"() {device = "", name = "unnamed1"} : () -> !_tf.control -# CHECK-NEXT: return -# CHECK-NEXT: } -# CHECK: func @foo0() { -# CHECK-NEXT: %0 = "_tf.bar0"() {device = "", name = "unnamed"} : () -> !_tf.control -# CHECK-NEXT: return -# CHECK-NEXT: } -# CHECK: func @bar0() { -# CHECK-NEXT: return -# CHECK-NEXT: } +# Verify that functions from the library are properly imported. + +# CHECK-LABEL: func @main() { +# CHECK: "_tf.foo0"() +# CHECK: "_tf.bar0"() + +# CHECK-LABEL: func @foo0() { +# CHECK: "_tf.bar0"() + +# CHECK-LABEL: func @bar0() { From bde30027236891f3c3f35e931c6f01f890e11ff4 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 24 Jul 2019 13:15:47 -0700 Subject: [PATCH 0495/3053] Rename markdown file to README so it appears rendered in GitHub, and revise the instructions for simplicity (don't use bazel; just run the .py) PiperOrigin-RevId: 259802492 --- tensorflow/lite/examples/python/README.md | 47 +++++++++++++++++ .../lite/examples/python/label_image.md | 50 ------------------- 2 files changed, 47 insertions(+), 50 deletions(-) create mode 100644 tensorflow/lite/examples/python/README.md delete mode 100644 tensorflow/lite/examples/python/label_image.md diff --git a/tensorflow/lite/examples/python/README.md b/tensorflow/lite/examples/python/README.md new file mode 100644 index 00000000000..b5ad7d1a412 --- /dev/null +++ b/tensorflow/lite/examples/python/README.md @@ -0,0 +1,47 @@ +# TensorFlow Lite Python image classification demo + +This `label_image.py` script shows how you can load a pre-trained and converted +TensorFlow Lite model and use it to recognize objects in images. The Python +script accepts arguments specifying the model to use, the corresponding labels +file, and the image to process. + +Before you begin, +make sure you [have TensorFlow installed](https://www.tensorflow.org/install). + + +## Download sample model and image + +You can use any compatible model, but the following MobileNet v1 model offers +a good demonstration of a model trained to recognize 1,000 different objects. + +``` +# Get photo +curl https://raw.githubusercontent.com/tensorflow/tensorflow/master/tensorflow/lite/examples/label_image/testdata/grace_hopper.bmp > /tmp/grace_hopper.bmp +# Get model +curl http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz | tar xzv -C /tmp +# Get labels +curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_1.0_224_frozen.tgz | tar xzv -C /tmp mobilenet_v1_1.0_224/labels.txt + +mv /tmp/mobilenet_v1_1.0_224/labels.txt /tmp/ +``` + +## Run the sample + +Note: Instead use `python` if you're using Python 2.x. 
+ +``` +python3 label_image.py \ + --model_file /tmp/mobilenet_v1_1.0_224.tflite \ + --label_file /tmp/labels.txt \ + --image /tmp/grace_hopper.bmp +``` + +You should see results like this: + +``` +0.728693: military uniform +0.116163: Windsor tie +0.035517: bow tie +0.014874: mortarboard +0.011758: bolo tie +``` diff --git a/tensorflow/lite/examples/python/label_image.md b/tensorflow/lite/examples/python/label_image.md deleted file mode 100644 index b4ec42f5259..00000000000 --- a/tensorflow/lite/examples/python/label_image.md +++ /dev/null @@ -1,50 +0,0 @@ - -With model, input image (grace_hopper.bmp), and labels file (labels.txt) -in /tmp. - -The example input image and labels file are from TensorFlow repo and -MobileNet V1 model files. - -``` -curl https://raw.githubusercontent.com/tensorflow/tensorflow/master/tensorflow/lite/examples/label_image/testdata/grace_hopper.bmp > /tmp/grace_hopper.bmp - -curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_1.0_224_frozen.tgz | tar xzv -C /tmp mobilenet_v1_1.0_224/labels.txt -mv /tmp/mobilenet_v1_1.0_224/labels.txt /tmp/ - -``` - -Run - -``` -curl http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224_quant.tgz | tar xzv -C /tmp -bazel run --config opt //tensorflow/lite/examples/python:label_image -``` - -We can get results like - -``` -0.470588: military uniform -0.337255: Windsor tie -0.047059: bow tie -0.031373: mortarboard -0.019608: suit -``` - -Run - -``` -curl http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz | tar xzv -C /tmp -bazel run --config opt //tensorflow/lite/examples/python:label_image \ --- --model_file /tmp/mobilenet_v1_1.0_224.tflite -``` - -We can get results like -``` -0.728693: military uniform -0.116163: Windsor tie -0.035517: bow tie -0.014874: mortarboard -0.011758: bolo tie -``` - -Check [models](../../g3doc/models.md) for models hosted by Google. From 2a309a6dadf2c799fea61c2b3fb91cf91cef8cad Mon Sep 17 00:00:00 2001 From: Saran Tunyasuvunakool Date: Wed, 24 Jul 2019 13:36:31 -0700 Subject: [PATCH 0496/3053] Remove "_DEBUG" from the `defines` list for LLVM. 
PiperOrigin-RevId: 259806830 --- third_party/llvm/llvm.bzl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/llvm/llvm.bzl b/third_party/llvm/llvm.bzl index efb62a4644f..8b0fdec0482 100644 --- a/third_party/llvm/llvm.bzl +++ b/third_party/llvm/llvm.bzl @@ -354,7 +354,7 @@ llvm_defines = select({ "UNICODE", "_UNICODE", ], - "//conditions:default": ["_DEBUG"], + "//conditions:default": [], }) + [ "LLVM_ENABLE_STATS", "__STDC_LIMIT_MACROS", From 8da9e29aeb7f75e01bc476af42e1ee5e8ca48c5a Mon Sep 17 00:00:00 2001 From: Edward Loper Date: Wed, 24 Jul 2019 13:45:01 -0700 Subject: [PATCH 0497/3053] Improve tf.while_loop shape_invariant handling for TypeSpecs PiperOrigin-RevId: 259808483 --- .../kernel_tests/control_flow_ops_py_test.py | 45 +++++++++++++++++++ tensorflow/python/ops/control_flow_ops.py | 8 ++++ 2 files changed, 53 insertions(+) diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py index 9bc9f303d91..bb7f7f64a44 100644 --- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py +++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py @@ -43,6 +43,7 @@ from tensorflow.python.framework import function from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape +from tensorflow.python.framework import tensor_spec from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops @@ -2117,6 +2118,50 @@ class ControlFlowTest(test.TestCase, parameterized.TestCase): self.assertTrue(r.values.row_splits.shape.as_list() in ([6], [None])) self.assertTrue(r.values.values.shape.as_list() in ([49], [None])) + def testWhileShapeInvariantTensorSpec(self): + i = constant_op.constant(0) + x = constant_op.constant([1]) + c = lambda i, _: i < 10 + b = lambda i, x: (i + 1, array_ops.stack([x, x])) + shape_invariants = [ + tensor_spec.TensorSpec([], dtype=dtypes.int32), + tensor_spec.TensorSpec(None, dtype=dtypes.int32)] + control_flow_ops.while_loop(c, b, [i, x], shape_invariants) + + # TODO(b/131265085) Remove this decorator when bug is fixed. + @test_util.build_as_function_and_v1_graph + def testWhileShapeInvariantWrongTypeSpecType(self): + c = lambda i, _: i < 10 + b = lambda i, x: (i + 1, x) + i = constant_op.constant(0) + x = sparse_tensor.SparseTensor([[0]], [1.0], [10]) + shape_invariants = [ + tensor_spec.TensorSpec([], dtype=dtypes.int32), + sparse_tensor.SparseTensorSpec([None])] + control_flow_ops.while_loop(c, b, [i, x], shape_invariants) + + x2 = constant_op.constant([1]) + with self.assertRaises(TypeError): + control_flow_ops.while_loop(c, b, [i, x2], shape_invariants) + + x3 = ragged_factory_ops.constant([[1, 2], [3]]) + with self.assertRaises(TypeError): + control_flow_ops.while_loop(c, b, [i, x3], shape_invariants) + + i2 = constant_op.constant(0.0) + with self.assertRaises(TypeError): + control_flow_ops.while_loop(c, b, [i2, x], shape_invariants) + + # TODO(b/131265085) Remove this decorator when bug is fixed. 
+ @test_util.build_as_function_and_v1_graph + def testWhileShapeInvariantBadType(self): + i = constant_op.constant(0) + x = constant_op.constant([1]) + c = lambda i, _: i < 10 + b = lambda i, x: (i + 1, x) + with self.assertRaises((ValueError, TypeError)): + control_flow_ops.while_loop(c, b, [i, x], ["foo", "bar"]) + def _testNestedWhile_1(self, use_gpu): with self.cached_session(use_gpu=use_gpu): n = constant_op.constant(0) diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py index 4f719086123..d06b9e82cc1 100644 --- a/tensorflow/python/ops/control_flow_ops.py +++ b/tensorflow/python/ops/control_flow_ops.py @@ -482,6 +482,12 @@ def _get_shape_invariant(var, shape=None): elif shape is None: return var.shape + elif isinstance(shape, tensor_spec.TensorSpec): + if var.dtype != shape.dtype: + raise TypeError("TensorSpec %r is not compatible with %r" % (shape, var)) + return shape.shape + elif isinstance(shape, type_spec.TypeSpec): + raise TypeError("TypeSpec %r is not compatible with %r" % (shape, var)) else: return shape @@ -498,6 +504,8 @@ def _shape_invariant_to_type_spec(var, shape): A `TypeSpec` for `var`, consistent with the given shape. """ if isinstance(shape, type_spec.TypeSpec): + if not shape.is_compatible_with(var): + raise TypeError("TypeSpec %r is not compatible with %r" % (shape, var)) return shape elif not isinstance(shape, tensor_shape.TensorShape): raise TypeError("Expected shape to be a TypeSpec or TensorShape, got %r" From 3a72de3a1b88d0c12f70713675bc83ed8addae6d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 24 Jul 2019 13:49:58 -0700 Subject: [PATCH 0498/3053] Ruy: Move ARM packing code into separate file. PiperOrigin-RevId: 259809541 --- tensorflow/lite/experimental/ruy/BUILD | 2 +- tensorflow/lite/experimental/ruy/{pack.cc => pack_arm.cc} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename tensorflow/lite/experimental/ruy/{pack.cc => pack_arm.cc} (100%) diff --git a/tensorflow/lite/experimental/ruy/BUILD b/tensorflow/lite/experimental/ruy/BUILD index 6c75783f2ce..60ad08bbda7 100644 --- a/tensorflow/lite/experimental/ruy/BUILD +++ b/tensorflow/lite/experimental/ruy/BUILD @@ -278,7 +278,7 @@ cc_library( cc_library( name = "pack", srcs = [ - "pack.cc", + "pack_arm.cc", ], hdrs = [ "pack.h", diff --git a/tensorflow/lite/experimental/ruy/pack.cc b/tensorflow/lite/experimental/ruy/pack_arm.cc similarity index 100% rename from tensorflow/lite/experimental/ruy/pack.cc rename to tensorflow/lite/experimental/ruy/pack_arm.cc From 5d37c2b785d6133de3d34ae708dd6ace9d445e5e Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Wed, 24 Jul 2019 13:50:36 -0700 Subject: [PATCH 0499/3053] Use the zero-copy implementation of GraphConstructor in more places. Many uses of GraphConstructor take a `const GraphDef&` to a locally-defined GraphDef that is subsequently destroyed. We can move the GraphDef into GraphConstructor to avoid copying the graph nodes repeatedly. In some cases with large GraphDefs (e.g. with large embedded constant tensors) this optimization will reduce peak memory consumption. 
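The call-site pattern behind this change, as a minimal sketch (not the actual TensorFlow sources): it assumes the rvalue-reference overload of `ConvertGraphDefToGraph` that the updated call sites below rely on, the helper name is hypothetical, and the header paths are approximate for this revision. The point is only that a locally owned `GraphDef` which is about to go out of scope can be moved into the graph constructor instead of having its nodes copied.

```
#include <utility>

#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/graph/graph.h"
#include "tensorflow/core/graph/graph_constructor.h"

namespace tensorflow {

// Hypothetical helper, for illustration only: builds `*graph` from a GraphDef
// that the caller no longer needs afterwards.
Status BuildGraphFromOwnedDef(GraphDef graph_def, Graph* graph) {
  GraphConstructorOptions opts;
  opts.allow_internal_ops = true;
  // Passing `graph_def` by const reference here would force GraphConstructor
  // to copy every NodeDef even though the local GraphDef is destroyed on
  // return. Moving it instead lets the zero-copy path take the nodes without
  // duplicating them, which is what this patch switches the call sites to.
  return ConvertGraphDefToGraph(opts, std::move(graph_def), graph);
}

}  // namespace tensorflow
```

A call site would then follow the same move-into-the-constructor shape as the `partitions` loops updated below in direct_session.cc and graph_mgr.cc, e.g. moving `partition.second` rather than passing it by const reference.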
PiperOrigin-RevId: 259809688 --- tensorflow/cc/framework/scope.cc | 2 +- .../mlir/tensorflow/translate/import_graphdef.cc | 8 ++++---- tensorflow/compiler/tf2xla/tf2xla.cc | 4 ++-- tensorflow/core/common_runtime/direct_session.cc | 6 +++--- .../core/common_runtime/graph_execution_state.cc | 4 ++-- tensorflow/core/distributed_runtime/graph_mgr.cc | 6 +++--- tensorflow/core/graph/graph_def_builder_util.cc | 2 +- tensorflow/core/grappler/grappler_item_builder.cc | 4 ++-- .../core/grappler/optimizers/function_optimizer.cc | 2 +- tensorflow/core/grappler/optimizers/meta_optimizer.cc | 11 ++++++----- .../tools/optimization/optimization_pass_runner.cc | 4 ++-- 11 files changed, 27 insertions(+), 26 deletions(-) diff --git a/tensorflow/cc/framework/scope.cc b/tensorflow/cc/framework/scope.cc index e93ca8633e6..459149b47d1 100644 --- a/tensorflow/cc/framework/scope.cc +++ b/tensorflow/cc/framework/scope.cc @@ -318,7 +318,7 @@ Status Scope::ToGraph(Graph* g, GraphConstructorOptions opts) const { if (ok()) { GraphDef graph_def; graph()->ToGraphDef(&graph_def); - UpdateStatus(ConvertGraphDefToGraph(opts, graph_def, g)); + UpdateStatus(ConvertGraphDefToGraph(opts, std::move(graph_def), g)); } return *impl()->status_; } diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_graphdef.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_graphdef.cc index 0b9012d9df0..e334da1df36 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_graphdef.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_graphdef.cc @@ -300,8 +300,8 @@ Status Importer::RemoveBackedges(const Graph& graph) { graph_ = absl::make_unique(graph.flib_def()); GraphConstructorOptions opts; opts.allow_internal_ops = true; - TF_RETURN_IF_ERROR( - ::tensorflow::ConvertGraphDefToGraph(opts, graph_def, graph_.get())); + TF_RETURN_IF_ERROR(::tensorflow::ConvertGraphDefToGraph( + opts, std::move(graph_def), graph_.get())); // Remove all the backedges. So the nodes can be added to the shape refiner. TF_RETURN_IF_ERROR(back_edge_helper_.Remove(graph_.get())); @@ -1394,8 +1394,8 @@ StatusOr ConvertGraphdefToMlir( if (add_default_attributes) { TF_RETURN_IF_ERROR(AddDefaultsToNodeDef(&preprocessed_graphdef)); } - TF_RETURN_IF_ERROR( - ConvertGraphDefToGraph(options, preprocessed_graphdef, &graph)); + TF_RETURN_IF_ERROR(ConvertGraphDefToGraph( + options, std::move(preprocessed_graphdef), &graph)); return ConvertGraphToMlir(graph, debug_info, graph.flib_def(), specs, context); diff --git a/tensorflow/compiler/tf2xla/tf2xla.cc b/tensorflow/compiler/tf2xla/tf2xla.cc index 3e4188f3c6d..3c2b256800c 100644 --- a/tensorflow/compiler/tf2xla/tf2xla.cc +++ b/tensorflow/compiler/tf2xla/tf2xla.cc @@ -384,8 +384,8 @@ Status InitGraph(const GraphDef& graph_def, const tf2xla::Config& config, TF_RETURN_IF_ERROR(AddDefaultAttrsToGraphDef( &second_copy_def, *g->op_registry(), /*node_offset=*/0)); - TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(GraphConstructorOptions(), - second_copy_def, g.get())); + TF_RETURN_IF_ERROR(ConvertGraphDefToGraph( + GraphConstructorOptions(), std::move(second_copy_def), g.get())); TF_RETURN_IF_ERROR(RewriteAndPruneGraph(g.get(), config, feed_remapping)); // Functionalize control flow. 
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc index 3661367c708..c764a587757 100644 --- a/tensorflow/core/common_runtime/direct_session.cc +++ b/tensorflow/core/common_runtime/direct_session.cc @@ -1614,15 +1614,15 @@ Status DirectSession::CreateGraphs( } } - for (const auto& partition : partitions) { + for (auto& partition : partitions) { std::unique_ptr device_graph( new Graph(client_graph->flib_def.get())); GraphConstructorOptions device_opts; // There are internal operations (e.g., send/recv) that we now allow. device_opts.allow_internal_ops = true; device_opts.expect_device_spec = true; - TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(device_opts, partition.second, - device_graph.get())); + TF_RETURN_IF_ERROR(ConvertGraphDefToGraph( + device_opts, std::move(partition.second), device_graph.get())); outputs->emplace(partition.first, std::move(device_graph)); } diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc index 49071833f24..7468d6bc72a 100644 --- a/tensorflow/core/common_runtime/graph_execution_state.cc +++ b/tensorflow/core/common_runtime/graph_execution_state.cc @@ -757,8 +757,8 @@ Status GraphExecutionState::OptimizeGraph( GraphConstructorOptions opts; opts.allow_internal_ops = true; - TF_RETURN_IF_ERROR( - ConvertGraphDefToGraph(opts, new_graph, optimized_graph->get())); + TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(opts, std::move(new_graph), + optimized_graph->get())); // The graph conversion sets the requested device names but not the // assigned device names. However, since at this point the graph is placed // TF expects an assigned device name for every node. Therefore we copy diff --git a/tensorflow/core/distributed_runtime/graph_mgr.cc b/tensorflow/core/distributed_runtime/graph_mgr.cc index 81d6412e1bf..5d06bf9a75b 100644 --- a/tensorflow/core/distributed_runtime/graph_mgr.cc +++ b/tensorflow/core/distributed_runtime/graph_mgr.cc @@ -179,14 +179,14 @@ Status GraphMgr::InitItem(const string& handle, const GraphDef& gdef, } std::unordered_map> partition_graphs; - for (const auto& partition : partitions) { + for (auto& partition : partitions) { std::unique_ptr device_graph(new Graph(OpRegistry::Global())); GraphConstructorOptions device_opts; // There are internal operations (e.g., send/recv) that we now allow. 
device_opts.allow_internal_ops = true; device_opts.expect_device_spec = true; - TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(device_opts, partition.second, - device_graph.get())); + TF_RETURN_IF_ERROR(ConvertGraphDefToGraph( + device_opts, std::move(partition.second), device_graph.get())); partition_graphs.emplace(partition.first, std::move(device_graph)); } diff --git a/tensorflow/core/graph/graph_def_builder_util.cc b/tensorflow/core/graph/graph_def_builder_util.cc index 102c72185f7..3ca9f8a21ff 100644 --- a/tensorflow/core/graph/graph_def_builder_util.cc +++ b/tensorflow/core/graph/graph_def_builder_util.cc @@ -22,7 +22,7 @@ Status GraphDefBuilderToGraph(const GraphDefBuilder& builder, Graph* graph) { GraphDef graph_def; TF_RETURN_IF_ERROR(builder.ToGraphDef(&graph_def)); GraphConstructorOptions opts; - return ConvertGraphDefToGraph(opts, graph_def, graph); + return ConvertGraphDefToGraph(opts, std::move(graph_def), graph); } } // namespace tensorflow diff --git a/tensorflow/core/grappler/grappler_item_builder.cc b/tensorflow/core/grappler/grappler_item_builder.cc index 9790915eb96..6d49b2f29d0 100644 --- a/tensorflow/core/grappler/grappler_item_builder.cc +++ b/tensorflow/core/grappler/grappler_item_builder.cc @@ -267,8 +267,8 @@ Status RuntimeGraphOptimizer(const GraphDef& graph_def_arg, graph_ctor_opts.expect_device_spec = false; std::unique_ptr graphptr(new Graph(function_library)); - TF_RETURN_IF_ERROR( - ConvertGraphDefToGraph(graph_ctor_opts, graph_def, graphptr.get())); + TF_RETURN_IF_ERROR(ConvertGraphDefToGraph( + graph_ctor_opts, std::move(graph_def), graphptr.get())); // Optimize the graph. ::tensorflow::GraphOptimizer optimizer(*optimizer_opts); diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc index b4f5c36bb9c..ca8f7a2e05f 100644 --- a/tensorflow/core/grappler/optimizers/function_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc @@ -784,7 +784,7 @@ constexpr const char* const kLowerAsMultiDeviceFunctionAttr = using KeepCallerNode = InlineFunctionBodyOptions::KeepCallerNode; using OutputControlSource = InlineFunctionBodyOptions::OutputControlSource; -// Checks if boolean attribute is defined and it's value is 'true'. +// Checks if boolean attribute is defined and its value is 'true'. bool CheckBoolAttr(const Node* n, absl::string_view attr_name) { bool match; Status s = GetNodeAttr(n->attrs(), attr_name, &match); diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc index 7f1302d6b09..00164c52bd8 100644 --- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc @@ -802,8 +802,6 @@ Status OptimizeGraph( std::unique_ptr optimized_graph( new tensorflow::Graph(OpRegistry::Global())); - TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(GraphConstructorOptions(), - out_graph, optimized_graph.get())); // Copy optimized functions back to the overlay lib. if (flib) { @@ -817,25 +815,28 @@ Status OptimizeGraph( } } - *g = std::move(optimized_graph); + TF_RETURN_IF_ERROR(ConvertGraphDefToGraph( + GraphConstructorOptions(), std::move(out_graph), optimized_graph.get())); // The graph conversion sets the requested device names but not the // assigned device names. However, since at this point the graph is // placed TF expects an assigned device name for every node. Therefore // we copy the requested device into the assigned device field. 
- for (Node* node : (*g)->nodes()) { + for (Node* node : optimized_graph->nodes()) { if (node->IsOp() && node->assigned_device_name().empty()) { if (node->requested_device().empty()) { return errors::Internal( "Either placer did not place the node or Grappler did not " "copy the assigned device. Contact Grappler team since latter " "is more likely. Node=", - node->name(), " Graph: ", (*g)->ToGraphDefDebug().DebugString()); + node->name(), + " Graph: ", optimized_graph->ToGraphDefDebug().DebugString()); } node->set_assigned_device_name(node->requested_device()); } } + *g = std::move(optimized_graph); return Status::OK(); } diff --git a/tensorflow/tools/optimization/optimization_pass_runner.cc b/tensorflow/tools/optimization/optimization_pass_runner.cc index 162d39d7aee..8cd9e32ba6f 100644 --- a/tensorflow/tools/optimization/optimization_pass_runner.cc +++ b/tensorflow/tools/optimization/optimization_pass_runner.cc @@ -111,8 +111,8 @@ Status OptimizationPassRunner::Run(absl::string_view pass_to_run, GraphConstructorOptions graph_opts; graph_opts.expect_device_spec = true; graph_opts.allow_internal_ops = true; - TF_RETURN_IF_ERROR( - ConvertGraphDefToGraph(graph_opts, input, options.graph->get())); + TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(graph_opts, std::move(input), + options.graph->get())); // Add all devices that were previously configured with AddDevice. DeviceSet device_set; From 8bac1116b7e6f018f65b39de6b1eb36513b9f6ce Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Wed, 24 Jul 2019 14:01:35 -0700 Subject: [PATCH 0500/3053] In TF_SetAttrValueProto, move the incoming AttrValue into the NodeDef being constructed. This change avoids unnecessary copy overhead for attr values, which can potentially be large TensorProto values. PiperOrigin-RevId: 259811941 --- tensorflow/c/c_api.cc | 2 +- tensorflow/core/framework/attr_value_util.cc | 5 ++--- tensorflow/core/framework/node_def_builder.cc | 18 ++++++++++++++++-- tensorflow/core/framework/node_def_builder.h | 6 ++++++ tensorflow/core/framework/node_def_util.cc | 4 ++++ tensorflow/core/framework/node_def_util.h | 1 + 6 files changed, 30 insertions(+), 6 deletions(-) diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc index 62b2504a26d..52a1a48b706 100644 --- a/tensorflow/c/c_api.cc +++ b/tensorflow/c/c_api.cc @@ -1024,7 +1024,7 @@ void TF_SetAttrValueProto(TF_OperationDescription* desc, const char* attr_name, desc->colocation_constraints.insert(location); } } else { - desc->node_builder.Attr(attr_name, attr_value); + desc->node_builder.Attr(attr_name, std::move(attr_value)); } status->status = Status::OK(); diff --git a/tensorflow/core/framework/attr_value_util.cc b/tensorflow/core/framework/attr_value_util.cc index 1eafd292f0f..5d290dea9ed 100644 --- a/tensorflow/core/framework/attr_value_util.cc +++ b/tensorflow/core/framework/attr_value_util.cc @@ -129,8 +129,6 @@ bool FastAreTensorProtosEqual(const TensorProto& lhs, const TensorProto& rhs) { } using TensorProtoHasher = std::function; -using TensorProtosEquality = - std::function; uint64 AttrValueHash(const AttrValue& a, const TensorProtoHasher& tensor_hash) { if (a.has_tensor()) return tensor_hash(a.tensor()); @@ -150,8 +148,9 @@ uint64 AttrValueHash(const AttrValue& a, const TensorProtoHasher& tensor_hash) { return DeterministicProtoHash64(a); } +template bool AreAttrValuesEqual(const AttrValue& a, const AttrValue& b, - const TensorProtosEquality& tensor_equality) { + TensorProtosEquality tensor_equality) { if (a.type() != b.type()) { return false; } else if (a.type() != 
DT_INVALID && b.type() != DT_INVALID) { diff --git a/tensorflow/core/framework/node_def_builder.cc b/tensorflow/core/framework/node_def_builder.cc index 58f79bd3657..9011b61715e 100644 --- a/tensorflow/core/framework/node_def_builder.cc +++ b/tensorflow/core/framework/node_def_builder.cc @@ -261,19 +261,33 @@ Status NodeDefBuilder::Finalize(NodeDef* node_def, bool consume) { } } -NodeDefBuilder& NodeDefBuilder::Attr(StringPiece name, const AttrValue& value) { +bool NodeDefBuilder::AttrValueAlreadyPresent(StringPiece name, + const AttrValue& value) { if (const AttrValue* found = AttrSlice(node_def_).Find(name)) { if (!AreAttrValuesEqual(*found, value)) { errors_.push_back(strings::StrCat("Inconsistent values for attr '", name, "' ", SummarizeAttrValue(*found), " vs. ", SummarizeAttrValue(value))); } - } else { + return true; + } + return false; +} + +NodeDefBuilder& NodeDefBuilder::Attr(StringPiece name, const AttrValue& value) { + if (!AttrValueAlreadyPresent(name, value)) { AddNodeAttr(name, value, &node_def_); } return *this; } +NodeDefBuilder& NodeDefBuilder::Attr(StringPiece name, AttrValue&& value) { + if (!AttrValueAlreadyPresent(name, value)) { + AddNodeAttr(name, std::move(value), &node_def_); + } + return *this; +} + #define ATTR(T) \ NodeDefBuilder& NodeDefBuilder::Attr(StringPiece name, T value) { \ AttrValue attr_value; \ diff --git a/tensorflow/core/framework/node_def_builder.h b/tensorflow/core/framework/node_def_builder.h index 92d6399d1e2..b4509662e15 100644 --- a/tensorflow/core/framework/node_def_builder.h +++ b/tensorflow/core/framework/node_def_builder.h @@ -93,6 +93,7 @@ class NodeDefBuilder { // Sets the attr, if not already set. If already set with a different // value, an error will be returned from Finalize(). NodeDefBuilder& Attr(StringPiece name, const AttrValue& value); + NodeDefBuilder& Attr(StringPiece name, AttrValue&& value); NodeDefBuilder& Attr(StringPiece name, StringPiece value); NodeDefBuilder& Attr(StringPiece name, const char* value); NodeDefBuilder& Attr(StringPiece name, int32 value); @@ -172,6 +173,11 @@ class NodeDefBuilder { return input_arg->is_ref() ? MakeRefType(dt) : dt; } + // Returns true if an attr named `name` is already present in the node_def_. + // If such an attr is already present and `value` is not equal to the present + // value, an error is generated. + bool AttrValueAlreadyPresent(StringPiece name, const AttrValue& value); + const OpDef* op_def_; NodeDef node_def_; int inputs_specified_; diff --git a/tensorflow/core/framework/node_def_util.cc b/tensorflow/core/framework/node_def_util.cc index a130d26504b..d3e43b0cb0f 100644 --- a/tensorflow/core/framework/node_def_util.cc +++ b/tensorflow/core/framework/node_def_util.cc @@ -753,6 +753,10 @@ void AddNodeAttr(StringPiece name, const AttrValue& value, NodeDef* node_def) { AttrValueMap::value_type(string(name), value)); } +void AddNodeAttr(StringPiece name, AttrValue&& value, NodeDef* node_def) { + (*node_def->mutable_attr())[string(name)] = std::move(value); +} + #define ADD_NODE_ATTR(T) \ void AddNodeAttr(StringPiece name, T value, NodeDef* node_def) { \ AttrValue attr_value; \ diff --git a/tensorflow/core/framework/node_def_util.h b/tensorflow/core/framework/node_def_util.h index 1a089b5f638..51ec33bdac9 100644 --- a/tensorflow/core/framework/node_def_util.h +++ b/tensorflow/core/framework/node_def_util.h @@ -74,6 +74,7 @@ typedef protobuf::Map AttrValueMap; // Adds an attr with name and value to *node_def. // The type of the attr is based on the type of value. 
void AddNodeAttr(StringPiece name, const AttrValue& value, NodeDef* node_def); +void AddNodeAttr(StringPiece name, AttrValue&& value, NodeDef* node_def); void AddNodeAttr(StringPiece name, StringPiece value, NodeDef* node_def); void AddNodeAttr(StringPiece name, const char* value, NodeDef* node_def); void AddNodeAttr(StringPiece name, int32 value, NodeDef* node_def); From 0f08941cfbdf24474f7660cde5e880633a7e78be Mon Sep 17 00:00:00 2001 From: Thomas O'Malley Date: Wed, 24 Jul 2019 14:04:49 -0700 Subject: [PATCH 0501/3053] Support CompositeTensors in V2 single code path. PiperOrigin-RevId: 259812771 --- .../distribute/distributed_training_utils.py | 2 +- .../python/keras/engine/data_adapter.py | 12 +- .../python/keras/engine/training_eager.py | 10 +- .../python/keras/engine/training_utils.py | 3 +- .../python/keras/engine/training_v2_utils.py | 4 +- .../utils/composite_tensor_support_test.py | 110 +++++++++++------- 6 files changed, 90 insertions(+), 51 deletions(-) diff --git a/tensorflow/python/keras/distribute/distributed_training_utils.py b/tensorflow/python/keras/distribute/distributed_training_utils.py index 1f484ae7551..28489de3fc1 100644 --- a/tensorflow/python/keras/distribute/distributed_training_utils.py +++ b/tensorflow/python/keras/distribute/distributed_training_utils.py @@ -304,7 +304,7 @@ def validate_per_replica_inputs(distribution_strategy, x): """ # Convert the inputs and targets into a list of PerReplica objects. - per_replica_list = nest.flatten(x) + per_replica_list = nest.flatten(x, expand_composites=True) x_values_list = [] for x in per_replica_list: if not tensor_util.is_tensor(x): diff --git a/tensorflow/python/keras/engine/data_adapter.py b/tensorflow/python/keras/engine/data_adapter.py index bd29560dfbe..e1c5bc6a9ea 100644 --- a/tensorflow/python/keras/engine/data_adapter.py +++ b/tensorflow/python/keras/engine/data_adapter.py @@ -27,6 +27,7 @@ import six from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import ops +from tensorflow.python.framework.ops import composite_tensor from tensorflow.python.keras.engine import training_utils from tensorflow.python.keras.utils import data_utils from tensorflow.python.util import nest @@ -170,7 +171,16 @@ class TensorLikeDataAdapter(DataAdapter): if y is not None: flat_inputs += nest.flatten(y) - return all(isinstance(v, (ops.Tensor, np.ndarray)) for v in flat_inputs) + def _is_tensor_or_composite(v): + if isinstance(v, (ops.Tensor, np.ndarray)): + return True + # Dataset inherits from CompositeTensor but shouldn't be handled here. 
+ if (isinstance(v, composite_tensor.CompositeTensor) and + not isinstance(v, dataset_ops.DatasetV2)): + return True + return False + + return all(_is_tensor_or_composite(v) for v in flat_inputs) def __init__(self, x, y=None, sample_weights=None, batch_size=None, shuffle=False, **kwargs): diff --git a/tensorflow/python/keras/engine/training_eager.py b/tensorflow/python/keras/engine/training_eager.py index a1470fe4fa8..15b5ad3061b 100644 --- a/tensorflow/python/keras/engine/training_eager.py +++ b/tensorflow/python/keras/engine/training_eager.py @@ -283,10 +283,9 @@ def train_on_batch(model, targets = training_utils.cast_if_floating_dtype(targets) else: inputs = training_utils.cast_if_floating_to_model_input_dtypes( - [ops.convert_to_tensor(val) for val in inputs], model) + inputs, model) if targets: - targets = training_utils.cast_if_floating_dtype( - [ops.convert_to_tensor(val) for val in targets]) + targets = training_utils.cast_if_floating_dtype(targets) if sample_weights: sample_weights = [ training_utils.cast_if_floating_dtype(ops.convert_to_tensor(val)) @@ -337,10 +336,9 @@ def test_on_batch(model, targets = training_utils.cast_if_floating_dtype(targets) else: inputs = training_utils.cast_if_floating_to_model_input_dtypes( - [ops.convert_to_tensor(val) for val in inputs], model) + inputs, model) if targets: - targets = training_utils.cast_if_floating_dtype( - [ops.convert_to_tensor(val) for val in targets]) + targets = training_utils.cast_if_floating_dtype(targets) if sample_weights: sample_weights = [ training_utils.cast_if_floating_dtype(ops.convert_to_tensor(val)) diff --git a/tensorflow/python/keras/engine/training_utils.py b/tensorflow/python/keras/engine/training_utils.py index f4c2b2613c1..1aecf8cf666 100644 --- a/tensorflow/python/keras/engine/training_utils.py +++ b/tensorflow/python/keras/engine/training_utils.py @@ -1191,7 +1191,8 @@ def check_steps_argument(input_data, steps, steps_name): def cast_single_tensor(x, dtype=None): - x = ops.convert_to_tensor(x) + if isinstance(x, np.ndarray): + x = ops.convert_to_tensor(x) dtype = dtype or K.floatx() if x.dtype.is_floating: return math_ops.cast(x, dtype=dtype) diff --git a/tensorflow/python/keras/engine/training_v2_utils.py b/tensorflow/python/keras/engine/training_v2_utils.py index ec898493a25..c972a4cc9dd 100644 --- a/tensorflow/python/keras/engine/training_v2_utils.py +++ b/tensorflow/python/keras/engine/training_v2_utils.py @@ -29,6 +29,7 @@ import functools from tensorflow.python.distribute import distribution_strategy_context from tensorflow.python.eager import def_function from tensorflow.python.framework import tensor_util +from tensorflow.python.framework.ops import composite_tensor from tensorflow.python.keras import backend from tensorflow.python.keras.distribute import distributed_training_utils as dist_utils from tensorflow.python.keras.engine import training_eager @@ -125,7 +126,8 @@ def _get_input_from_iterator(iterator): """Get elements from the iterator and verify the input shape and type.""" next_element = next(iterator) - if tensor_util.is_tensor(next_element) or isinstance(next_element, dict): + if (tensor_util.is_tensor(next_element) or + isinstance(next_element, (dict, composite_tensor.CompositeTensor))): next_element = [next_element] if len(next_element) == 1: x, = next_element diff --git a/tensorflow/python/keras/utils/composite_tensor_support_test.py b/tensorflow/python/keras/utils/composite_tensor_support_test.py index 649a1f8d409..11382e2156f 100644 --- 
a/tensorflow/python/keras/utils/composite_tensor_support_test.py +++ b/tensorflow/python/keras/utils/composite_tensor_support_test.py @@ -26,6 +26,7 @@ import scipy.sparse from tensorflow.python import keras from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.eager import context from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor @@ -152,6 +153,17 @@ def get_model_from_layers_with_input(layers, raise ValueError("Unknown model type {}".format(model_type)) +def get_test_mode_kwargs(): + run_eagerly = testing_utils.should_run_eagerly() + # Certain things weren't supported correctly in the old path, therefore + # with these changes, some tests now only pass in the single code path in V2. + if run_eagerly or context.executing_eagerly(): + run_distributed = True + else: + run_distributed = testing_utils.should_run_distributed() + return {"run_eagerly": run_eagerly, "run_distributed": run_distributed} + + @keras_parameterized.run_with_all_model_types @keras_parameterized.run_all_keras_modes class CompositeTensorInternalTest(keras_parameterized.TestCase): @@ -194,11 +206,7 @@ class CompositeTensorInternalTest(keras_parameterized.TestCase): input_data = np.random.rand(1024, 1) expected_data = np.concatenate((input_data * 3, input_data * .5), axis=-1) - model.compile( - loss="mse", - optimizer="adam", - run_eagerly=testing_utils.should_run_eagerly(), - run_distributed=testing_utils.should_run_distributed()) + model.compile(loss="mse", optimizer="adam", **get_test_mode_kwargs()) history = model.fit(input_data, expected_data, epochs=10, verbose=0) # If the model trained, the loss stored at history[0] should be different @@ -284,26 +292,28 @@ def get_input_name(use_dict): return "test_input_name" -def get_steps(): - # Determine the steps arg (if appropriate) - if not testing_utils.should_run_eagerly(): - # CompositeTensors in graph mode are symbolic and so require a steps arg. - return 1 +def get_kwargs(use_dataset, action="predict"): + if use_dataset or not context.executing_eagerly(): + if action == "fit": + return {"steps_per_epoch": 1} + return {"steps": 1} else: - return None + return {"batch_size": 2} def prepare_inputs(data, use_dict, use_dataset, action, input_name): input_data, expected_output = data + batch_size = input_data.shape[0] # Prepare the input data. 
if use_dict: input_data = {input_name: input_data} if use_dataset: if action == "predict": - input_data = dataset_ops.Dataset.from_tensors(input_data) + input_data = dataset_ops.DatasetV2.from_tensor_slices(input_data).batch( + batch_size) else: - input_data = dataset_ops.Dataset.from_tensors( - (input_data, expected_output)) + input_data = dataset_ops.DatasetV2.from_tensor_slices( + (input_data, expected_output)).batch(batch_size) expected_output = None return (input_data, expected_output) @@ -332,8 +342,12 @@ class SparseTensorInputTest(keras_parameterized.TestCase): shape=(1, None), sparse=True, name=input_name, dtype=dtypes.int32) layers = [ToDense(default_value=-1)] model = get_model_from_layers_with_input(layers, model_input=model_input) - model.compile(optimizer="sgd", loss="mse", metrics=["accuracy"]) - steps = get_steps() + model.compile( + optimizer="sgd", + loss="mse", + metrics=["accuracy"], + **get_test_mode_kwargs()) + kwargs = get_kwargs(use_dataset, action) # Prepare the input data for data_element in data: @@ -342,15 +356,14 @@ class SparseTensorInputTest(keras_parameterized.TestCase): input_name) # Perform the action. if action == "predict": - result = model.predict(input_data, steps=steps) + result = model.predict(input_data, **kwargs) self.assertAllEqual(expected_output, result) if action == "evaluate": - result = model.evaluate(input_data, expected_output, steps=steps) + result = model.evaluate(input_data, expected_output, **kwargs) self.assertAllEqual(1.0, result[-1]) if action == "fit": # TODO(momernick): What's the best way of validating that fit happened? - _ = model.fit( - input_data, expected_output, shuffle=False, steps_per_epoch=steps) + _ = model.fit(input_data, expected_output, shuffle=False, **kwargs) @keras_parameterized.run_with_all_model_types @@ -385,7 +398,11 @@ class ScipySparseTensorInputTest(keras_parameterized.TestCase, model_input = input_layer.Input(shape=(3,), sparse=True, dtype=dtypes.int64) layers = [ToDense(default_value=-1)] model = get_model_from_layers_with_input(layers, model_input=model_input) - model.compile(optimizer="sgd", loss="mse", metrics=["accuracy"]) + model.compile( + optimizer="sgd", + loss="mse", + metrics=["accuracy"], + run_distributed=testing_utils.should_run_distributed()) input_data = scipy.sparse.coo_matrix(([1, 2, 3], ([0, 1, 1], [0, 0, 1])), shape=[2, 3]) @@ -443,7 +460,11 @@ class ScipySparseTensorInputTest(keras_parameterized.TestCase, shape=(3,), sparse=True, name=input_name, dtype=dtypes.int64) layers = [ToDense(default_value=-1)] model = get_model_from_layers_with_input(layers, model_input=model_input) - model.compile(optimizer="sgd", loss="mse", metrics=["accuracy"]) + model.compile( + optimizer="sgd", + loss="mse", + metrics=["accuracy"], + run_distributed=testing_utils.should_run_distributed()) input_data = { input_name: @@ -484,7 +505,11 @@ class RaggedTensorInputTest(keras_parameterized.TestCase, shape=(None, None), ragged=True, name=input_name, dtype=dtypes.int32) layers = [ToDense(default_value=-1)] model = get_model_from_layers_with_input(layers, model_input=model_input) - model.compile(optimizer="sgd", loss="mse", metrics=["accuracy"]) + model.compile( + optimizer="sgd", + loss="mse", + metrics=["accuracy"], + **get_test_mode_kwargs()) # Prepare the input data for data_element in data: @@ -524,7 +549,11 @@ class RaggedTensorInputValidationTest(keras_parameterized.TestCase, shape=input_shape, ragged=True, name=input_name, dtype=dtypes.int32) layers = [ToDense(default_value=-1)] model = 
get_model_from_layers_with_input(layers, model_input=model_input) - model.compile(optimizer="sgd", loss="mse", metrics=["accuracy"]) + model.compile( + optimizer="sgd", + loss="mse", + metrics=["accuracy"], + **get_test_mode_kwargs()) for data_element in data: input_data, expected_output = prepare_inputs( @@ -549,11 +578,12 @@ class RaggedTensorInputValidationTest(keras_parameterized.TestCase, shape=input_shape, ragged=True, name=input_name, dtype=dtypes.int32) layers = [ToDense(default_value=-1)] model = get_model_from_layers_with_input(layers, model_input=model_input) - model.compile(optimizer="sgd", loss="mse", metrics=["accuracy"]) - - # The input is a symbolic tensor in non-Eager modes, so 'steps' is required - # for that case only. - steps = get_steps() + model.compile( + optimizer="sgd", + loss="mse", + metrics=["accuracy"], + **get_test_mode_kwargs()) + kwargs = get_kwargs(use_dataset) for data_element in data: input_data, expected_output = prepare_inputs( @@ -562,7 +592,7 @@ class RaggedTensorInputValidationTest(keras_parameterized.TestCase, use_dataset, action="predict", input_name=input_name) - result = model.predict(input_data, steps=steps) + result = model.predict(input_data, **kwargs) self.assertAllEqual(expected_output, result) def test_ragged_tensor_input_with_wrong_ragged_rank_fails( @@ -577,7 +607,11 @@ class RaggedTensorInputValidationTest(keras_parameterized.TestCase, shape=input_shape, ragged=True, name=input_name, dtype=dtypes.int32) layers = [ToDense(default_value=-1)] model = get_model_from_layers_with_input(layers, model_input=model_input) - model.compile(optimizer="sgd", loss="mse", metrics=["accuracy"]) + model.compile( + optimizer="sgd", + loss="mse", + metrics=["accuracy"], + **get_test_mode_kwargs()) # Define some input data with the wrong ragged rank for data_element in data: @@ -618,15 +652,9 @@ class SparseTensorInputValidationTest(keras_parameterized.TestCase): # Define some input data. input_data = sparse_tensor.SparseTensor([[0, 0, 0], [1, 0, 0], [1, 0, 1]], [1, 2, 3], [2, 1, 3]) - if not testing_utils.should_run_eagerly(): - # This ragged tensor is actually a standard tensor (as it has no ragged - # dimensions). Because of this, graph mode models will expect a steps - # arg to be passed (as SparseTensors in graph mode are symbolic). - steps = 1 - else: - steps = None + kwargs = get_kwargs(use_dataset=False) with self.assertRaisesRegex(ValueError, ".*got array with shape.*"): - _ = model.predict(input_data, steps=steps) + _ = model.predict(input_data, **kwargs) def test_ragged_tensor_input_with_wrong_value_shape(self): # Create a model that accepts a ragged input and converts it to dense. @@ -652,14 +680,14 @@ class UndefinedCompositeTensorInputsTest(keras_parameterized.TestCase): # back to a dense tensor. layers = [ToDense(default_value=-1)] model = testing_utils.get_model_from_layers(layers) - steps = get_steps() # Define some input data. input_data = sparse_tensor.SparseTensor([[0, 0], [1, 0], [1, 1]], [1, 2, 3], [2, 3]) + kwargs = get_kwargs(False) with self.assertRaisesRegex( ValueError, ".*All SparseTensor and RaggedTensor inputs .*"): - _ = model.predict(input_data, steps=steps) + _ = model.predict(input_data, **kwargs) def test_subclass_implicit_sparse_scipy_inputs_fails(self): # Create a model that accepts a sparse input and converts the sparse tensor From 02c9ee21b3f14c4a19e326ee3197908a3d65cb9a Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 24 Jul 2019 14:16:42 -0700 Subject: [PATCH 0502/3053] ICU 64+ no longer uses U_HAVE_STD_ATOMICS PiperOrigin-RevId: 259815120 --- third_party/icu/BUILD.bazel | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/icu/BUILD.bazel b/third_party/icu/BUILD.bazel index 36d6b9006b9..69496567ebd 100644 --- a/third_party/icu/BUILD.bazel +++ b/third_party/icu/BUILD.bazel @@ -44,7 +44,7 @@ cc_library( ]), copts = [ "-DU_COMMON_IMPLEMENTATION", - "-DU_HAVE_STD_ATOMICS", + "-DU_HAVE_STD_ATOMICS", # TODO(gunan): Remove when TF is on ICU 64+. ] + select({ ":android": [ "-fdata-sections", From b049a48a621a85c6a73f41e4fe2592178185b267 Mon Sep 17 00:00:00 2001 From: Pooya Davoodi Date: Mon, 8 Jul 2019 16:56:41 -0700 Subject: [PATCH 0503/3053] Validate max_batch_size only in static mode Do not change max_batch_size under the hood, let user make that change. --- .../convert/trt_optimization_pass.cc | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc index 6af483d37cf..20e84f7a5a8 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/trt_optimization_pass.cc @@ -193,32 +193,32 @@ Status TRTOptimizationPass::Optimize(grappler::Cluster* cluster, LOG(INFO) << CurrentStackTrace(); PrintDebugInfo(cluster, item); } - int max_dim = -1; - if (!item.feed.empty()) { - for (const auto& f : item.feed) { - const auto& shape = f.second.shape(); - if (shape.dims() > 0) { - if (shape.dim_size(0) > max_dim) max_dim = shape.dim_size(0); + if (!is_dynamic_op_) { + int max_batch_dim = -1; + if (!item.feed.empty()) { + for (const auto& f : item.feed) { + const auto& shape = f.second.shape(); + if (shape.dims() > 0) { + if (shape.dim_size(0) > max_batch_dim) max_batch_dim = shape.dim_size(0); + VLOG(2) << "Setting max_batch_dim to " << max_batch_dim + << " using batch dimension of " << f.first + << " with shape " << shape; + } } } - } - if (maximum_batch_size_ < 0) { // automatic batch size from input - if (max_dim > 0) { - maximum_batch_size_ = max_dim; - VLOG(1) << "Setting maximum batch size to " << max_dim; - } else { - maximum_batch_size_ = 128; - LOG(WARNING) << "Maximum batch size is not set" - " and can't be deduced from inputs setting it to" - << maximum_batch_size_ - << ". Suggest configuring it from configuration parameters"; - } - } else { - if (max_dim > maximum_batch_size_) { - LOG(WARNING) << "Configured batch size " << maximum_batch_size_ - << " is less than input batch size " << max_dim - << " adjusting maximum batch size to match input batch size"; + if (max_batch_dim > maximum_batch_size_) { + return errors::InvalidArgument( + "Specified max_batch_size=", maximum_batch_size_, + " is less than maximum batch dimension of inputs (", + max_batch_dim, "). ", + "To continue, set max_batch_size to >= ", max_batch_dim); } + else if (max_batch_dim < maximum_batch_size_) { + LOG(INFO) << "Specified max_batch_size=" << maximum_batch_size_ + << " is larger than maximum batch dimension of inputs (" + << max_batch_dim << "). 
" + << "This can result in poor performance."; + } } grappler::GraphProperties static_graph_properties(item); TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(true)); From 5abd6055e17d274f90351493c7ff3c2d176cd2a0 Mon Sep 17 00:00:00 2001 From: "Xiaoming (Jason) Cui" Date: Wed, 24 Jul 2019 12:06:11 -0700 Subject: [PATCH 0504/3053] [INTEL MKL] changed the function name matmul_prefix() to matmul_op_name() and also changed the function to return op name directly instead of name prefix, with this change, we can remove many string concatation in the test to improve the performance --- .../python/debug/cli/analyzer_cli_test.py | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/tensorflow/python/debug/cli/analyzer_cli_test.py b/tensorflow/python/debug/cli/analyzer_cli_test.py index 1ce8745b245..9562021b200 100644 --- a/tensorflow/python/debug/cli/analyzer_cli_test.py +++ b/tensorflow/python/debug/cli/analyzer_cli_test.py @@ -46,9 +46,9 @@ from tensorflow.python.platform import googletest from tensorflow.python.platform import test from tensorflow.python.util import tf_inspect -def matmul_prefix(): - prefix = "_Mkl" if test_util.IsMklEnabled() else "" - return prefix +def matmul_op_name(): + op_name = "_MklMatMul" if test_util.IsMklEnabled() else "MatMul" + return op_name def _cli_config_from_temp_file(): return cli_config.CLIConfig( @@ -677,7 +677,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): "simple_mul_add/add:0" ], ["VariableV2", "VariableV2", "Identity", "Identity", - matmul_prefix() + "MatMul", "Add"]) + matmul_op_name(), "Add"]) # Check the main menu. check_main_menu(self, out, list_tensors_enabled=False) @@ -693,7 +693,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): "simple_mul_add/matmul:0", "simple_mul_add/add:0" ], ["VariableV2", "VariableV2", "Identity", "Identity", - matmul_prefix() + "MatMul", "Add"], + matmul_op_name(), "Add"], sort_by="timestamp", reverse=True) check_main_menu(self, out, list_tensors_enabled=False) @@ -708,7 +708,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): "simple_mul_add/matmul:0", "simple_mul_add/add:0" ], ["VariableV2", "VariableV2", "Identity", "Identity", - matmul_prefix() + "MatMul", "Add"], + matmul_op_name(), "Add"], sort_by="dump_size") check_main_menu(self, out, list_tensors_enabled=False) @@ -722,7 +722,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): "simple_mul_add/matmul:0", "simple_mul_add/add:0" ], ["VariableV2", "VariableV2", "Identity", "Identity", - matmul_prefix() + "MatMul", "Add"], + matmul_op_name(), "Add"], sort_by="dump_size", reverse=True) check_main_menu(self, out, list_tensors_enabled=False) @@ -743,7 +743,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): "simple_mul_add/matmul:0", "simple_mul_add/add:0" ], ["VariableV2", "VariableV2", "Identity", "Identity", - matmul_prefix() + "MatMul", "Add"], + matmul_op_name(), "Add"], sort_by="op_type", reverse=False) check_main_menu(self, out, list_tensors_enabled=False) @@ -759,7 +759,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): "simple_mul_add/matmul:0", "simple_mul_add/add:0" ], ["VariableV2", "VariableV2", "Identity", "Identity", - matmul_prefix() + "MatMul", "Add"], + matmul_op_name(), "Add"], sort_by="op_type", reverse=True) check_main_menu(self, out, list_tensors_enabled=False) @@ -775,7 +775,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): "simple_mul_add/matmul:0", "simple_mul_add/add:0" 
], ["VariableV2", "VariableV2", "Identity", "Identity", - matmul_prefix() + "MatMul", "Add"], + matmul_op_name(), "Add"], sort_by="tensor_name", reverse=False) check_main_menu(self, out, list_tensors_enabled=False) @@ -791,7 +791,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): "simple_mul_add/matmul:0", "simple_mul_add/add:0" ], ["VariableV2", "VariableV2", "Identity", "Identity", - matmul_prefix() + "MatMul", "Add"], + matmul_op_name(), "Add"], sort_by="tensor_name", reverse=True) check_main_menu(self, out, list_tensors_enabled=False) @@ -819,12 +819,12 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): op_type_regex="Identity") out = self._registry.dispatch_command( - "list_tensors", ["-t", "(Add|" + matmul_prefix() + "MatMul)"]) + "list_tensors", ["-t", "(Add|" + matmul_op_name() + ")"]) assert_listed_tensors( self, out, ["simple_mul_add/add:0", "simple_mul_add/matmul:0"], - ["Add", matmul_prefix() + "MatMul"], - op_type_regex=("(Add|" + matmul_prefix() + "MatMul)")) + ["Add", matmul_op_name()], + op_type_regex=("(Add|" + matmul_op_name() + ")")) check_main_menu(self, out, list_tensors_enabled=False) def testListTensorFilterByNodeNameRegexAndOpTypeRegex(self): @@ -860,7 +860,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): assert_listed_tensors( self, out, ["simple_mul_add/matmul:0", "simple_mul_add/add:0"], - [matmul_prefix() + "MatMul", "Add"], tensor_filter_name="is_2x1_vector") + [matmul_op_name(), "Add"], tensor_filter_name="is_2x1_vector") check_main_menu(self, out, list_tensors_enabled=False) @@ -901,7 +901,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): recipients = [("Add", "simple_mul_add/add"), ("Add", "simple_mul_add/add")] assert_node_attribute_lines(self, out, node_name, - matmul_prefix() + "MatMul", + matmul_op_name(), self._main_device, [("Identity", "simple_mul_add/u/read"), ("Identity", "simple_mul_add/v/read")], [], @@ -933,7 +933,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): self, out, node_name, - matmul_prefix() + "MatMul", + matmul_op_name(), self._main_device, [("Identity", "simple_mul_add/u/read"), ("Identity", "simple_mul_add/v/read")], [], [("Add", "simple_mul_add/add"), ("Add", "simple_mul_add/add")], [], @@ -954,7 +954,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): self, out, node_name, - matmul_prefix() + "MatMul", + matmul_op_name(), self._main_device, [("Identity", "simple_mul_add/u/read"), ("Identity", "simple_mul_add/v/read")], [], [("Add", "simple_mul_add/add"), ("Add", "simple_mul_add/add")], [], @@ -980,7 +980,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): self, out, node_name, - matmul_prefix() + "MatMul", + matmul_op_name(), self._main_device, [("Identity", "simple_mul_add/u/read"), ("Identity", "simple_mul_add/v/read")], [], [("Add", "simple_mul_add/add"), ("Add", "simple_mul_add/add")], [], @@ -1003,7 +1003,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): self, out, node_name, - matmul_prefix() + "MatMul", + matmul_op_name(), self._main_device, [("Identity", "simple_mul_add/u/read"), ("Identity", "simple_mul_add/v/read")], [], [("Add", "simple_mul_add/add"), ("Add", "simple_mul_add/add")], [], @@ -1024,7 +1024,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): assert_node_attribute_lines(self, out, node_name, "Identity", self._main_device, [("VariableV2", "simple_mul_add/u")], [], - [(matmul_prefix() + "MatMul", + [(matmul_op_name(), 
"simple_mul_add/matmul")], []) check_main_menu( self, From 6f067fd4434d4502de3619ee2d71f5830cf613bc Mon Sep 17 00:00:00 2001 From: "Xiaoming (Jason) Cui" Date: Wed, 24 Jul 2019 14:48:50 -0700 Subject: [PATCH 0505/3053] [INTEL MKL] Added description of function matmul_op_name() newly added in the test tensorflow/python/debug/cli/analyzer_cli_test.py, and a few minor changes to optimize the function --- tensorflow/python/debug/cli/analyzer_cli_test.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/debug/cli/analyzer_cli_test.py b/tensorflow/python/debug/cli/analyzer_cli_test.py index 9562021b200..982fccfd58c 100644 --- a/tensorflow/python/debug/cli/analyzer_cli_test.py +++ b/tensorflow/python/debug/cli/analyzer_cli_test.py @@ -46,9 +46,19 @@ from tensorflow.python.platform import googletest from tensorflow.python.platform import test from tensorflow.python.util import tf_inspect +# There are two types MKL supported operators. One type operators whose kernels +# understand MKL layout in input tensors, # (e.g., MklConv2D, etc.) we +# registered them with 'MklLayoutDependentOp' label. The other operators whose +# kernels don't understand input tensors with MKL layout. # (e.g., MklMatMul, +# MklTranspose), we registered them with 'MklNameChangeOp' label. With those +# operators registered as 'MklNameChangeOp' operators, we go through a name +# change during graph rewrite pass, and we changed the name of operators by +# adding "Mkl" before their original name. In this test, only MatMul is +# affected. We add this function to automatically change the operator's name +# 'MatMul' to 'MklMatMul' when the test is running with MKL enabled TensorFlow, +# so that the test can pass. def matmul_op_name(): - op_name = "_MklMatMul" if test_util.IsMklEnabled() else "MatMul" - return op_name + return "_MklMatMul" if test_util.IsMklEnabled() else "MatMul" def _cli_config_from_temp_file(): return cli_config.CLIConfig( From 3ed5bc4dd968bfd0c44982df10ba0a69e4feae12 Mon Sep 17 00:00:00 2001 From: jerryyin Date: Wed, 24 Jul 2019 21:56:25 +0000 Subject: [PATCH 0506/3053] [ROCm] Adding support to rnn ops --- tensorflow/core/kernels/rnn/BUILD | 7 ++++++- tensorflow/core/kernels/rnn/blas_gemm.cc | 10 +++++----- tensorflow/core/kernels/rnn/gru_ops.cc | 4 ++-- tensorflow/core/kernels/rnn/gru_ops_gpu.cu.cc | 4 ++-- tensorflow/core/kernels/rnn/lstm_ops.cc | 20 +++++++++---------- .../core/kernels/rnn/lstm_ops_gpu.cu.cc | 4 ++-- 6 files changed, 27 insertions(+), 22 deletions(-) diff --git a/tensorflow/core/kernels/rnn/BUILD b/tensorflow/core/kernels/rnn/BUILD index 2975e8bc02c..4ec405b5bff 100644 --- a/tensorflow/core/kernels/rnn/BUILD +++ b/tensorflow/core/kernels/rnn/BUILD @@ -10,6 +10,10 @@ load( "//tensorflow/core:platform/default/cuda_build_defs.bzl", "if_cuda_is_configured", ) +load( + "@local_config_rocm//rocm:build_defs.bzl", + "if_rocm_is_configured", +) package( default_visibility = ["//tensorflow:internal"], @@ -19,7 +23,8 @@ licenses(["notice"]) # Apache 2.0 tf_gpu_library( name = "blas_gemm", - srcs = if_cuda_is_configured(["blas_gemm.cc"]), + srcs = if_cuda_is_configured(["blas_gemm.cc"]) + + if_rocm_is_configured(["blas_gemm.cc"]), hdrs = ["blas_gemm.h"], deps = [ "//tensorflow/core:framework", diff --git a/tensorflow/core/kernels/rnn/blas_gemm.cc b/tensorflow/core/kernels/rnn/blas_gemm.cc index e9da5f0aebb..d0f25dd73bb 100644 --- a/tensorflow/core/kernels/rnn/blas_gemm.cc +++ b/tensorflow/core/kernels/rnn/blas_gemm.cc @@ -15,15 +15,15 @@ limitations under 
the License. #define EIGEN_USE_THREADS -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #include "tensorflow/core/platform/stream_executor.h" -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/kernels/rnn/blas_gemm.h" namespace tensorflow { -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM namespace { template se::DeviceMemory AsDeviceMemory(const T* cuda_memory) { @@ -32,7 +32,7 @@ se::DeviceMemory AsDeviceMemory(const T* cuda_memory) { return typed; } } // namespace -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM namespace functor { template @@ -41,7 +41,7 @@ void TensorCuBlasGemm::operator()(OpKernelContext* ctx, bool transa, float alpha, const T* a, int lda, const T* b, int ldb, float beta, T* c, int ldc) { -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM se::blas::Transpose trans[] = {se::blas::Transpose::kNoTranspose, se::blas::Transpose::kTranspose}; diff --git a/tensorflow/core/kernels/rnn/gru_ops.cc b/tensorflow/core/kernels/rnn/gru_ops.cc index 27e1698ece5..fbeaf3c7810 100644 --- a/tensorflow/core/kernels/rnn/gru_ops.cc +++ b/tensorflow/core/kernels/rnn/gru_ops.cc @@ -380,7 +380,7 @@ REGISTER_KERNEL(float); #undef REGISTER_KERNEL // GPU support. -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define EIGEN_USE_GPU // Forward declare the GPU Fprop functor. @@ -445,6 +445,6 @@ DECLARE_GPU_SPEC(float); REGISTER_GPU_KERNEL(float); #undef REGISTER_GPU_KERNEL -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } // end namespace tensorflow diff --git a/tensorflow/core/kernels/rnn/gru_ops_gpu.cu.cc b/tensorflow/core/kernels/rnn/gru_ops_gpu.cu.cc index ca4c233388d..d72a3b1efef 100644 --- a/tensorflow/core/kernels/rnn/gru_ops_gpu.cu.cc +++ b/tensorflow/core/kernels/rnn/gru_ops_gpu.cu.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define EIGEN_USE_GPU #include "tensorflow/core/kernels/rnn/gru_ops.h" @@ -32,4 +32,4 @@ DEFINE_GPU_SPECS(float); } // end namespace functor } // end namespace tensorflow -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/rnn/lstm_ops.cc b/tensorflow/core/kernels/rnn/lstm_ops.cc index b1bf1cae0ce..7e067b31ecf 100644 --- a/tensorflow/core/kernels/rnn/lstm_ops.cc +++ b/tensorflow/core/kernels/rnn/lstm_ops.cc @@ -15,9 +15,9 @@ limitations under the License. 
#define EIGEN_USE_THREADS -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define EIGEN_USE_GPU -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM #include "tensorflow/core/kernels/rnn/lstm_ops.h" @@ -378,7 +378,7 @@ REGISTER_KERNEL(float); REGISTER_KERNEL(Eigen::half); #undef REGISTER_KERNEL -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM namespace functor { #define DECLARE_GPU_SPEC(T) \ template <> \ @@ -412,7 +412,7 @@ REGISTER_GPU_KERNEL(float); REGISTER_GPU_KERNEL(Eigen::half); // REGISTER_GPU_KERNEL(double); #undef REGISTER_GPU_KERNEL -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM template class LSTMBlockCellGradOp : public OpKernel { @@ -665,7 +665,7 @@ REGISTER_KERNEL(float); REGISTER_KERNEL(Eigen::half); #undef REGISTER_KERNEL -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM namespace functor { #define DECLARE_GPU_SPEC(T) \ template <> \ @@ -707,7 +707,7 @@ REGISTER_GPU_KERNEL(float); REGISTER_GPU_KERNEL(Eigen::half); // REGISTER_GPU_KERNEL(double); #undef REGISTER_GPU_KERNEL -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM namespace { @@ -1012,7 +1012,7 @@ REGISTER_KERNEL(float); REGISTER_KERNEL(Eigen::half); #undef REGISTER_KERNEL -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM namespace functor { #define DECLARE_GPU_SPEC(T) \ template <> \ @@ -1044,7 +1044,7 @@ REGISTER_GPU_KERNEL(float); REGISTER_GPU_KERNEL(Eigen::half); // REGISTER_GPU_KERNEL(double); #undef REGISTER_GPU_KERNEL -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM template class BlockLSTMGradOp : public OpKernel { @@ -1287,7 +1287,7 @@ REGISTER_KERNEL(float); REGISTER_KERNEL(Eigen::half); #undef REGISTER_KERNEL -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM namespace functor { #define DECLARE_GPU_SPEC(T) \ template <> \ @@ -1355,6 +1355,6 @@ REGISTER_GPU_KERNEL(float); REGISTER_GPU_KERNEL(Eigen::half); // REGISTER_GPU_KERNEL(double); #undef REGISTER_GPU_KERNEL -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } // end namespace tensorflow diff --git a/tensorflow/core/kernels/rnn/lstm_ops_gpu.cu.cc b/tensorflow/core/kernels/rnn/lstm_ops_gpu.cu.cc index 4101ee8ed2f..256591a7c62 100644 --- a/tensorflow/core/kernels/rnn/lstm_ops_gpu.cu.cc +++ b/tensorflow/core/kernels/rnn/lstm_ops_gpu.cu.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define EIGEN_USE_GPU @@ -460,4 +460,4 @@ DEFINE_GPU_SPECS(Eigen::half); } // end namespace functor } // end namespace tensorflow -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM From 59dae582c7af70aff1556505fcd9d42c94bc9f2a Mon Sep 17 00:00:00 2001 From: Pooya Davoodi Date: Wed, 24 Jul 2019 15:00:18 -0700 Subject: [PATCH 0507/3053] Add support for TensorRT precision mode in lowercase --- tensorflow/python/compiler/tensorrt/trt_convert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/compiler/tensorrt/trt_convert.py b/tensorflow/python/compiler/tensorrt/trt_convert.py index b11938aecc3..b9b6ca91587 100644 --- a/tensorflow/python/compiler/tensorrt/trt_convert.py +++ b/tensorflow/python/compiler/tensorrt/trt_convert.py @@ -94,8 +94,8 @@ class TrtPrecisionMode(object): @staticmethod def supported_precision_modes(): - return [TrtPrecisionMode.FP32, TrtPrecisionMode.FP16, TrtPrecisionMode.INT8] - + precisions = [TrtPrecisionMode.FP32, TrtPrecisionMode.FP16, TrtPrecisionMode.INT8] + return precisions + [p.lower() for p in precisions] # Use a large enough number as the default max_workspace_size for TRT engines, # so it can produce reasonable performance results with the default. From aa8f7194cfc7d9b0b6f9df061fb31cdcb656e0f0 Mon Sep 17 00:00:00 2001 From: Edward Loper Date: Wed, 24 Jul 2019 14:44:44 -0700 Subject: [PATCH 0508/3053] Add tf.ragged.stack, which is similar to tf.stack, but the inputs can have different shapes; and the result is a RaggedTensor. PiperOrigin-RevId: 259820570 --- .../python/ops/ragged/ragged_concat_ops.py | 50 ++++++++++--------- .../api/golden/v1/tensorflow.ragged.pbtxt | 4 ++ .../api/golden/v2/tensorflow.ragged.pbtxt | 4 ++ 3 files changed, 34 insertions(+), 24 deletions(-) diff --git a/tensorflow/python/ops/ragged/ragged_concat_ops.py b/tensorflow/python/ops/ragged/ragged_concat_ops.py index 30fe7530781..1372db07abc 100644 --- a/tensorflow/python/ops/ragged/ragged_concat_ops.py +++ b/tensorflow/python/ops/ragged/ragged_concat_ops.py @@ -27,6 +27,7 @@ from tensorflow.python.ops.ragged import ragged_array_ops from tensorflow.python.ops.ragged import ragged_gather_ops from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.ops.ragged import ragged_util +from tensorflow.python.util.tf_export import tf_export def concat(values, axis, name=None): @@ -70,40 +71,41 @@ def concat(values, axis, name=None): return _ragged_stack_concat_helper(values, axis, stack_values=False) +@tf_export('ragged.stack') def stack(values, axis=0, name=None): - """Stacks potentially ragged tensors along one dimension. + """Stacks a list of rank-`R` tensors into one rank-`(R+1)` `RaggedTensor`. - Given a list of tensors with the same rank `K` (`K >= axis`), returns a - rank-`K+1` `RaggedTensor` `result` such that `result[i0...iaxis]` is the - list `[rt[i0...iaxis] for rt in values]`. - - Args: - values: A list of potentially ragged tensors. May not be empty. All - `values` must have the same rank and the same dtype; but unlike - `tf.concat`, they can have arbitrary shapes. - axis: A python integer, indicating the dimension along which to stack. - (Note: Unlike `tf.stack`, the `axis` parameter must be statically known.) - Negative values are supported only if the rank of at least one - `values` value is statically known. - name: A name prefix for the returned tensor (optional). 
- - Returns: - A `RaggedTensor` with rank `K+1`. - `result.ragged_rank=max(axis, max(rt.ragged_rank for rt in values]))`. - - Raises: - ValueError: If `values` is empty, if `axis` is out of bounds or if - the input tensors have different ranks. + Given a list of tensors or ragged tensors with the same rank `R` + (`R >= axis`), returns a rank-`R+1` `RaggedTensor` `result` such that + `result[i0...iaxis]` is `[value[i0...iaxis] for value in values]`. #### Example: ```python >>> t1 = tf.ragged.constant([[1, 2], [3, 4, 5]]) >>> t2 = tf.ragged.constant([[6], [7, 8, 9]]) - >>> ragged.stack([t1, t2], axis=0) + >>> tf.ragged.stack([t1, t2], axis=0) [[[1, 2], [3, 4, 5]], [[6], [7, 9, 0]]] - >>> ragged.stack([t1, t2], axis=1) + >>> tf.ragged.stack([t1, t2], axis=1) [[[1, 2], [6]], [[3, 4, 5], [7, 8, 9]]] ``` + + Args: + values: A list of `tf.Tensor` or `tf.RaggedTensor`. May not be empty. All + `values` must have the same rank and the same dtype; but unlike + `tf.stack`, they can have arbitrary dimension sizes. + axis: A python integer, indicating the dimension along which to stack. + (Note: Unlike `tf.stack`, the `axis` parameter must be statically known.) + Negative values are supported only if the rank of at least one + `values` value is statically known. + name: A name prefix for the returned tensor (optional). + + Returns: + A `RaggedTensor` with rank `R+1`. + `result.ragged_rank=1+max(axis, max(rt.ragged_rank for rt in values]))`. + + Raises: + ValueError: If `values` is empty, if `axis` is out of bounds or if + the input tensors have different ranks. """ if not isinstance(values, (list, tuple)): values = [values] diff --git a/tensorflow/tools/api/golden/v1/tensorflow.ragged.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.ragged.pbtxt index 6b07759af97..55ad2621d80 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.ragged.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.ragged.pbtxt @@ -36,4 +36,8 @@ tf_module { name: "segment_ids_to_row_splits" argspec: "args=[\'segment_ids\', \'num_segments\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " } + member_method { + name: "stack" + argspec: "args=[\'values\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], " + } } diff --git a/tensorflow/tools/api/golden/v2/tensorflow.ragged.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.ragged.pbtxt index d3f70f130f7..2420aa902e0 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.ragged.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.ragged.pbtxt @@ -24,4 +24,8 @@ tf_module { name: "segment_ids_to_row_splits" argspec: "args=[\'segment_ids\', \'num_segments\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " } + member_method { + name: "stack" + argspec: "args=[\'values\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], " + } } From c5f031ce3406e2a2422a3bd3cdc86d21f32f7383 Mon Sep 17 00:00:00 2001 From: Pavithra Vijay Date: Wed, 24 Jul 2019 14:44:45 -0700 Subject: [PATCH 0509/3053] Handle partial sample weight use case in the single execution path data adapter. 
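The diff that follows makes the Keras data adapter accept sample weights for only a subset of the model outputs, filling the missing entries with all-ones weights of the same batch size. Below is a standalone NumPy sketch of that behaviour; the function name is hypothetical, NumPy stands in for the TF ops used in the real change, and, like the patch, it assumes at least one per-output weight is provided.

import numpy as np

def fill_partial_sample_weights(sample_weights):
  # If weights were supplied for only some outputs, weight the remaining
  # outputs uniformly with ones matching the batch dimension.
  if sample_weights is None or not any(w is None for w in sample_weights):
    return sample_weights
  # Assumes at least one entry is a real weight array.
  reference = next(w for w in sample_weights if w is not None)
  batch_size = reference.shape[0]
  return tuple(np.ones((batch_size,)) if w is None else w
               for w in sample_weights)

# Weights given for the first of two outputs only:
print(fill_partial_sample_weights([np.array([0.5, 2.0, 1.0]), None]))
# -> (array([0.5, 2. , 1. ]), array([1., 1., 1.]))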
PiperOrigin-RevId: 259820573 --- tensorflow/python/keras/engine/data_adapter.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tensorflow/python/keras/engine/data_adapter.py b/tensorflow/python/keras/engine/data_adapter.py index e1c5bc6a9ea..a25ffe906ce 100644 --- a/tensorflow/python/keras/engine/data_adapter.py +++ b/tensorflow/python/keras/engine/data_adapter.py @@ -30,6 +30,7 @@ from tensorflow.python.framework import ops from tensorflow.python.framework.ops import composite_tensor from tensorflow.python.keras.engine import training_utils from tensorflow.python.keras.utils import data_utils +from tensorflow.python.ops import array_ops from tensorflow.python.util import nest from tensorflow.python.util import tf_inspect @@ -188,6 +189,15 @@ class TensorLikeDataAdapter(DataAdapter): x = _process_numpy_inputs(x) y = _process_numpy_inputs(y) sample_weights = _process_numpy_inputs(sample_weights) + + # If sample_weights are not specified for an output use 1.0 as weights. + if sample_weights is not None and None in sample_weights: + weight = next(s for s in sample_weights if s is not None) + sample_weights = training_utils.list_to_tuple([ + array_ops.ones((weight.shape[0],)) if sw is None else sw + for sw in sample_weights + ]) + if y is not None and sample_weights is not None: inputs = (x, y, sample_weights) elif y is not None: From 450d077d77a89385721ff54828537009a567e447 Mon Sep 17 00:00:00 2001 From: Feng Liu Date: Wed, 24 Jul 2019 14:59:53 -0700 Subject: [PATCH 0510/3053] Fix type constraints in tfl fully-connected and logistic ops The bias of the fully-connected op needs to be quantized to 32 bits integer, so the type constraint of this operand should be QI32 and QUI32. The input and output of logistic op can also be quantized type. PiperOrigin-RevId: 259823593 --- tensorflow/compiler/mlir/lite/ir/tfl_ops.td | 43 ++++++++++--------- tensorflow/compiler/mlir/lite/tests/ops.mlir | 2 +- .../compiler/mlir/lite/tests/quantize.mlir | 29 +++++++++++++ 3 files changed, 53 insertions(+), 21 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index 298f962d096..6e30347bbcf 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -134,9 +134,12 @@ class TFL_Int8UniformQuantizedType : TFL_QuantizedType<"Uniform", [8, zero_pt, smantissa, sexp, -128, 127], 1>; -// 8-bits quantized types. The definitions can be used to specify tensor types. +// General uniform quantized types. The definitions can be used to specify +// operand's tensor types. def TFL_QUI8 : TFL_QuantizedType<"Uniform", [8], 0>; def TFL_QI8 : TFL_QuantizedType<"Uniform", [8], 1>; +def TFL_QUI32 : TFL_QuantizedType<"Uniform", [32], 0>; +def TFL_QI32 : TFL_QuantizedType<"Uniform", [32], 1>; //===----------------------------------------------------------------------===// // TensorType attribute definitions. 
@@ -579,7 +582,7 @@ def TFL_FullyConnectedOp : TFL_Op<"fully_connected", [ let arguments = (ins TensorOf<[F32, TFL_QI8, TFL_QUI8]>:$input, TensorOf<[F32, TFL_QI8, TFL_QUI8]>:$filter, - TFL_TensorOfOrNone<[F32, TFL_QI8, TFL_QUI8]>:$bias, + TFL_TensorOfOrNone<[F32, TFL_QI32, TFL_QUI32]>:$bias, TFL_AFAttr:$fused_activation_function, TFL_FullyConnectedOptionsWeightFormatAttr:$weights_format, @@ -1096,6 +1099,24 @@ def TFL_LogicalOrOp : TFL_Op<"logical_or", [NoSideEffect]> { let printer = [{ return mlir::impl::printBinaryOp(getOperation(), p); }]; } +def TFL_LogisticOp: TFL_Op<"logistic", [ + NoSideEffect, + SameOperandsAndResultShape, + // zero_point = 0 + // scale = 1. / (max_value + 1) + TFL_FixedResultScale>, + TFL_FixedResultScale>]> { + let summary = "Logistic operator"; + + let description = [{ + Computes element-wise Sigmoid of input + }]; + + let arguments = (ins TensorOf<[AnyFloat, TFL_QI8, TFL_QUI8]>:$x); + + let results = (outs TensorOf<[AnyFloat, TFL_QI8, TFL_QUI8]>:$y); +} + def TFL_LogOp: TFL_Op<"log", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Natural logarithm operator"; @@ -1674,24 +1695,6 @@ def TFL_ShapeOp: TFL_Op<"shape", [NoSideEffect, TFL_NoQuantizableResult]> { let hasOptions = 1; } -def TFL_LogisticOp: TFL_Op<"logistic", [ - NoSideEffect, - SameOperandsAndResultType, - // zero_point = 0 - // scale = 1. / (max_value + 1) - TFL_FixedResultScale>, - TFL_FixedResultScale>]> { - let summary = "Logistic operator"; - - let description = [{ - Computes element-wise Sigmoid of input - }]; - - let arguments = (ins TFL_FpTensor:$x); - - let results = (outs TFL_FpTensor:$y); -} - // TODO(jpienaar): Flesh this out. def TFL_RangeOp: TFL_Op<"range", [NoSideEffect]> { let summary = "Range operator"; diff --git a/tensorflow/compiler/mlir/lite/tests/ops.mlir b/tensorflow/compiler/mlir/lite/tests/ops.mlir index c627b9ebc3e..348a53499ee 100644 --- a/tensorflow/compiler/mlir/lite/tests/ops.mlir +++ b/tensorflow/compiler/mlir/lite/tests/ops.mlir @@ -489,7 +489,7 @@ func @testLogistic(tensor<1x2x3x4x5xbf16>) -> tensor<1x2x3x4x5xbf16> { // test invalid Logistic input func @testLogisticWithWrongInputType(tensor) -> tensor { ^bb0(%arg0: tensor): - // expected-error @+1 {{tfl.logistic' op operand #0 must be tensor of floating-point values}} + // expected-error @+1 {{tfl.logistic' op operand #0 must be tensor of floating-point or QI8 type or QUI8 type values}} %0 = "tfl.logistic"(%arg0): (tensor) -> tensor return %0#0 : tensor } diff --git a/tensorflow/compiler/mlir/lite/tests/quantize.mlir b/tensorflow/compiler/mlir/lite/tests/quantize.mlir index b3b439b2b8a..d0f98158a61 100644 --- a/tensorflow/compiler/mlir/lite/tests/quantize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/quantize.mlir @@ -82,6 +82,23 @@ func @QuantizeDepthwiseConv2D(tensor<1x224x224x3x!quant.uniform>) -> tensor<1x112x112x32x!quant.uniform> { +^bb0(%arg0: tensor<1x224x224x3x!quant.uniform>): + %cst = constant dense<-1.23697901> : tensor<32xf32> + %2 = "tfl.dequantize"(%arg0) : (tensor<1x224x224x3x!quant.uniform>) -> tensor<1x224x224x3xf32> + %3 = "tfl.pseudo_qconst"() {qtype = tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>>, value = dense<-76> : tensor<32x3x3x3xi8>} : () -> tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>> + %4 = "tfl.dequantize"(%3) : (tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>>) -> tensor<32x3x3x3xf32> + %5 = "tfl.fully_connected"(%2, %4, %cst) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : 
(tensor<1x224x224x3xf32>, tensor<32x3x3x3xf32>, tensor<32xf32>) -> tensor<1x112x112x32xf32> + %6 = "tfl.quantize"(%5) {qtype = tensor<1x112x112x32x!quant.uniform>} : (tensor<1x112x112x32xf32>) -> tensor<1x112x112x32x!quant.uniform> + return %6 : tensor<1x112x112x32x!quant.uniform> + +// CHECK: %0 = "tfl.pseudo_qconst"() {qtype = tensor<32x!quant.uniform>, value = dense<-7254> : tensor<32xi32>} +// CHECK: %1 = "tfl.pseudo_qconst"() {qtype = tensor<32x3x3x3x!quant.uniform:f32, 0.021826678373682216:151>>, value = dense<-76> : tensor<32x3x3x3xi8>} +// CHECK: %2 = "tfl.fully_connected"(%arg0, %1, %0) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} +// CHECK: return %2 +} + // CHECK-LABEL: QuantizeAveragePool2D func @QuantizeAveragePool2D(tensor<1x6x6x16x!quant.uniform>) -> tensor<1x1x1x16xf32> { ^bb0(%arg0: tensor<1x6x6x16x!quant.uniform>): @@ -118,6 +135,18 @@ func @QuantizeSoftmax(tensor<1x6x6x16x!quant.uniform>) // CHECK: return %1 : tensor<1x6x6x16xf32> } +// CHECK-LABEL: QuantizeLogistic +func @QuantizeLogistic(tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> { +^bb0(%arg0: tensor<1x6x6x16x!quant.uniform>): + %0 = "tfl.dequantize"(%arg0) : (tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> + %1 = "tfl.logistic"(%0) : (tensor<1x6x6x16xf32>) -> tensor<1x6x6x16xf32> + return %1 : tensor<1x6x6x16xf32> + +// CHECK: %0 = "tfl.logistic"(%arg0) : (tensor<1x6x6x16x!quant.uniform>) +// CHECK: %1 = "tfl.dequantize"(%0) : (tensor<1x6x6x16x!quant.uniform>) +// CHECK: return %1 +} + // CHECK-LABEL: QuantizeAdd func @QuantizeAdd(tensor<1x56x56x24x!quant.uniform>, tensor<1x56x56x24x!quant.uniform>) -> tensor<1x56x56x24x!quant.uniform> { ^bb0(%arg0: tensor<1x56x56x24x!quant.uniform>, %arg1: tensor<1x56x56x24x!quant.uniform>): From a0568a18d8f4fcce2a2ec6b07bd6f1bff841ab2a Mon Sep 17 00:00:00 2001 From: Igor Ganichev Date: Wed, 24 Jul 2019 15:14:48 -0700 Subject: [PATCH 0511/3053] Don't hold node_queue_mutex_ when destroying EagerNodes in EagerExecutor Unfortunately, some nodes' destructors can enqueue more operations onto this executor and cause a deadlock. Also, destroy `curr_node` after looking it up in node_done_notifications_. This fixes a subtle race condition - a new node can be created with the same address and added to node_done_notifications_ before we get a chance to erase the mapping for the original curr_node. PiperOrigin-RevId: 259826748 --- .../common_runtime/eager/eager_executor.cc | 77 +++++++++++-------- 1 file changed, 47 insertions(+), 30 deletions(-) diff --git a/tensorflow/core/common_runtime/eager/eager_executor.cc b/tensorflow/core/common_runtime/eager/eager_executor.cc index ae3369dfbc0..77ac926e919 100644 --- a/tensorflow/core/common_runtime/eager/eager_executor.cc +++ b/tensorflow/core/common_runtime/eager/eager_executor.cc @@ -92,7 +92,7 @@ tensorflow::Status EagerExecutor::status() const { void EagerExecutor::Run() { while (true) { - EagerNode* curr_node; + EagerNode* curr_node_raw; { tensorflow::mutex_lock l(node_queue_mutex_); while (node_queue_.empty() || !status_.ok()) { @@ -100,39 +100,56 @@ void EagerExecutor::Run() { nodes_pending_.wait(l); } // Obtain raw pointer since we don't want to remove from the queue until - // the node has been run. - curr_node = node_queue_.front().get(); + // the node has been run. Otherwise, WaitForAllPendingNodes can return + // too early. + // Note, we don't std::move from the here because the front of the queue + // will then contain a nullptr. 
This can be a problem in + // WaitForAllPendingNodes where we get the top EagerNode pointer + // and register a notification for its completion. + curr_node_raw = node_queue_.front().get(); } - tensorflow::Status status = curr_node->Run(); + tensorflow::Status status = curr_node_raw->Run(); const bool ok = status.ok(); - tensorflow::mutex_lock l(node_queue_mutex_); - node_queue_.pop(); - if (!ok) { - status_ = status; - // We remove any pending ops so that we don't try to execute them if - // ClearError is called. - errors::AppendToMessage(&status, - ". Encountered when executing an operation using " - "EagerExecutor. This error cancels all future " - "operations and poisons their output tensors."); - for (int i = 0; i < node_queue_.size(); ++i) { - node_queue_.front()->Abort(status); - // Dequeue and delete nodes - node_queue_.pop(); + + std::unique_ptr curr_node; + std::vector> nodes_to_destroy; + { + tensorflow::mutex_lock l(node_queue_mutex_); + curr_node = std::move(node_queue_.front()); + node_queue_.pop(); + if (!ok) { + status_ = status; + // We remove any pending ops so that we don't try to execute them if + // ClearError is called. + errors::AppendToMessage( + &status, + ". Encountered when executing an operation using " + "EagerExecutor. This error cancels all future " + "operations and poisons their output tensors."); + for (int i = 0; i < node_queue_.size(); ++i) { + node_queue_.front()->Abort(status); + nodes_to_destroy.push_back(std::move(node_queue_.front())); + node_queue_.pop(); + } + } + if (!node_done_notifications_.empty()) { + // Note that we notify all waiting threads in case an error has + // occurred. These calling threads are responsible for checking status_ + // before proceeding. + const auto range = + ok ? node_done_notifications_.equal_range(curr_node_raw) + : make_pair(node_done_notifications_.begin(), + node_done_notifications_.end()); + for (auto it = range.first; it != range.second; ++it) { + it->second->notify_all(); + } + node_done_notifications_.erase(range.first, range.second); } } - if (!node_done_notifications_.empty()) { - // Note that we notify all waiting threads in case an error has occurred. - // These calling threads are responsible for checking status_ before - // proceeding. - const auto range = ok ? node_done_notifications_.equal_range(curr_node) - : make_pair(node_done_notifications_.begin(), - node_done_notifications_.end()); - for (auto it = range.first; it != range.second; ++it) { - it->second->notify_all(); - } - node_done_notifications_.erase(range.first, range.second); - } + // curr_node and nodes_to_destroy will be destructed here, while not holding + // node_queue_mutex_. This is important because, unfortunately, some nodes' + // destructors can enqueue more operations onto this executor and cause + // a deadlock. } } From 8cbe52a0f34f7798c8cd8d447440e43da94b021a Mon Sep 17 00:00:00 2001 From: Ian Langmore Date: Wed, 24 Jul 2019 15:15:10 -0700 Subject: [PATCH 0512/3053] TESTFIX: `tests_to_skip` staticmethod renamed to `skip_these_tests`. 
This prevents conflict with python testing, which automatically runs any method starting with "test" PiperOrigin-RevId: 259826835 --- .../kernel_tests/linalg/linear_operator_circulant_test.py | 4 ++-- .../kernel_tests/linalg/linear_operator_composition_test.py | 2 +- .../kernel_tests/linalg/linear_operator_householder_test.py | 2 +- .../linalg/linear_operator_low_rank_update_test.py | 4 ++-- .../linalg/linear_operator_lower_triangular_test.py | 2 +- .../kernel_tests/linalg/linear_operator_toeplitz_test.py | 2 +- .../kernel_tests/linalg/linear_operator_zeros_test.py | 2 +- tensorflow/python/ops/linalg/linear_operator_test_util.py | 6 +++--- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py index 4c54ec6117c..f0e7efd578f 100644 --- a/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py +++ b/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py @@ -246,7 +246,7 @@ class LinearOperatorCirculantTestNonHermitianSpectrum( # Skip Cholesky since we are explicitly testing non-hermitian # spectra. @staticmethod - def tests_to_skip(): + def skip_these_tests(): return ["cholesky"] def operator_and_matrix( @@ -533,7 +533,7 @@ class LinearOperatorCirculant2DTestNonHermitianSpectrum( return [dtypes.complex64, dtypes.complex128] @staticmethod - def tests_to_skip(): + def skip_these_tests(): return ["cholesky"] def operator_and_matrix( diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_composition_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_composition_test.py index 2321a8c6d57..ba611a450c2 100644 --- a/tensorflow/python/kernel_tests/linalg/linear_operator_composition_test.py +++ b/tensorflow/python/kernel_tests/linalg/linear_operator_composition_test.py @@ -44,7 +44,7 @@ class SquareLinearOperatorCompositionTest( self._rtol[dtypes.complex64] = 1e-4 @staticmethod - def tests_to_skip(): + def skip_these_tests(): # Cholesky not implemented. return ["cholesky"] diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_householder_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_householder_test.py index b333dbf6ff4..4179d450ad1 100644 --- a/tensorflow/python/kernel_tests/linalg/linear_operator_householder_test.py +++ b/tensorflow/python/kernel_tests/linalg/linear_operator_householder_test.py @@ -46,7 +46,7 @@ class LinearOperatorHouseholderTest( shape_info((2, 1, 4, 4))] @staticmethod - def tests_to_skip(): + def skip_these_tests(): # This linear operator is never positive definite. 
return ["cholesky"] diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_low_rank_update_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_low_rank_update_test.py index 5c89607c1da..c438187e35f 100644 --- a/tensorflow/python/kernel_tests/linalg/linear_operator_low_rank_update_test.py +++ b/tensorflow/python/kernel_tests/linalg/linear_operator_low_rank_update_test.py @@ -181,7 +181,7 @@ class LinearOperatorLowRankUpdatetestWithDiagCannotUseCholesky( """A = L + UDU^H, D !> 0, L > 0 ==> A !> 0 and we cannot use a Cholesky.""" @staticmethod - def tests_to_skip(): + def skip_these_tests(): return ["cholesky"] _use_diag_update = True @@ -224,7 +224,7 @@ class LinearOperatorLowRankUpdatetestNoDiagCannotUseCholesky( """A = L + UV^H, L > 0 ==> A is not symmetric and we cannot use a Cholesky.""" @staticmethod - def tests_to_skip(): + def skip_these_tests(): return ["cholesky"] _use_diag_update = False diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py index 02ce5b810eb..71d24e316fe 100644 --- a/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py +++ b/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py @@ -34,7 +34,7 @@ class LinearOperatorLowerTriangularTest( """Most tests done in the base class LinearOperatorDerivedClassTest.""" @staticmethod - def tests_to_skip(): + def skip_these_tests(): # Cholesky does not make sense for triangular matrices. return ["cholesky"] diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_toeplitz_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_toeplitz_test.py index 22ae26f27b4..dececb81375 100644 --- a/tensorflow/python/kernel_tests/linalg/linear_operator_toeplitz_test.py +++ b/tensorflow/python/kernel_tests/linalg/linear_operator_toeplitz_test.py @@ -61,7 +61,7 @@ class LinearOperatorToeplitzTest( self._rtol[dtypes.complex128] = 1e-10 @staticmethod - def tests_to_skip(): + def skip_these_tests(): # Skip solve tests, as these could have better stability # (currently exercises the base class). # TODO(srvasude): Enable these when solve is implemented. diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py index 49bbc69149a..086f5eeef3c 100644 --- a/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py +++ b/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py @@ -37,7 +37,7 @@ class LinearOperatorZerosTest( """Most tests done in the base class LinearOperatorDerivedClassTest.""" @staticmethod - def tests_to_skip(): + def skip_these_tests(): return [ "cholesky", "log_abs_det", "inverse", "solve", "solve_with_broadcast"] diff --git a/tensorflow/python/ops/linalg/linear_operator_test_util.py b/tensorflow/python/ops/linalg/linear_operator_test_util.py index 3d1e1fc2e24..30399bdd3d4 100644 --- a/tensorflow/python/ops/linalg/linear_operator_test_util.py +++ b/tensorflow/python/ops/linalg/linear_operator_test_util.py @@ -178,7 +178,7 @@ class LinearOperatorDerivedClassTest(test.TestCase): raise NotImplementedError("make_x is not defined.") @staticmethod - def tests_to_skip(): + def skip_these_tests(): """List of test names to skip.""" # Subclasses should over-ride if they want to skip some tests. # To skip "test_foo", add "foo" to this list. 
@@ -569,7 +569,7 @@ def add_tests(test_cls): ] for name, test_template_fn in test_name_dict.items(): - if name in test_cls.tests_to_skip(): + if name in test_cls.skip_these_tests(): continue for dtype, use_placeholder, shape_info in itertools.product( @@ -674,7 +674,7 @@ class NonSquareLinearOperatorDerivedClassTest(LinearOperatorDerivedClassTest): """ @staticmethod - def tests_to_skip(): + def skip_these_tests(): """List of test names to skip.""" return [ "cholesky", From 16cb1cf58a9ca75091e827c84501a77e0cb03535 Mon Sep 17 00:00:00 2001 From: Ayush Dubey Date: Wed, 24 Jul 2019 15:16:55 -0700 Subject: [PATCH 0513/3053] Automated rollback of commit a5548b54eeb8270a05cfca2da3816f2e56853509 PiperOrigin-RevId: 259827212 --- .../python/tools/api/generator/api_gen.bzl | 4 +- .../tools/api/generator/create_python_api.py | 112 +++++++----------- 2 files changed, 41 insertions(+), 75 deletions(-) diff --git a/tensorflow/python/tools/api/generator/api_gen.bzl b/tensorflow/python/tools/api/generator/api_gen.bzl index 71610d3574b..234addaf782 100644 --- a/tensorflow/python/tools/api/generator/api_gen.bzl +++ b/tensorflow/python/tools/api/generator/api_gen.bzl @@ -92,8 +92,6 @@ def gen_api_init_files( " --compat_init_template=$(location %s)" % compat_init_template ) - loading_flag = " --loading=default" - native.genrule( name = name, outs = all_output_files, @@ -102,7 +100,7 @@ def gen_api_init_files( root_init_template_flag + " --apidir=$(@D)" + output_dir + " --apiname=" + api_name + " --apiversion=" + str(api_version) + compat_api_version_flags + " " + compat_init_template_flags + - loading_flag + " --package=" + ",".join(packages) + + " --package=" + ",".join(packages) + " --output_package=" + output_package + " $(OUTS)" ), srcs = srcs, diff --git a/tensorflow/python/tools/api/generator/create_python_api.py b/tensorflow/python/tools/api/generator/create_python_api.py index 98cd159a63f..a8a1c760637 100644 --- a/tensorflow/python/tools/api/generator/create_python_api.py +++ b/tensorflow/python/tools/api/generator/create_python_api.py @@ -75,6 +75,34 @@ class SymbolExposedTwiceError(Exception): pass +def format_import(source_module_name, source_name, dest_name): + """Formats import statement. + + Args: + source_module_name: (string) Source module to import from. + source_name: (string) Source symbol name to import. + dest_name: (string) Destination alias name. + + Returns: + An import statement string. + """ + if _LAZY_LOADING: + return " '%s': ('%s', '%s')," % (dest_name, source_module_name, + source_name) + else: + if source_module_name: + if source_name == dest_name: + return 'from %s import %s' % (source_module_name, source_name) + else: + return 'from %s import %s as %s' % (source_module_name, source_name, + dest_name) + else: + if source_name == dest_name: + return 'import %s' % source_name + else: + return 'import %s as %s' % (source_name, dest_name) + + def get_canonical_import(import_set): """Obtain one single import from a set of possible sources of a symbol. @@ -105,7 +133,7 @@ def get_canonical_import(import_set): class _ModuleInitCodeBuilder(object): """Builds a map from module name to imports included in that module.""" - def __init__(self, output_package, api_version, lazy_loading=_LAZY_LOADING): + def __init__(self, output_package, api_version): self._output_package = output_package # Maps API module to API symbol name to set of tuples of the form # (module name, priority). @@ -117,9 +145,6 @@ class _ModuleInitCodeBuilder(object): # Names that start with underscore in the root module. 
self._underscore_names_in_root = [] self._api_version = api_version - # Controls whether or not exported symbols are lazily loaded or statically - # imported. - self._lazy_loading = lazy_loading def _check_already_imported(self, symbol_id, api_name): if (api_name in self._dest_import_to_id and @@ -146,7 +171,7 @@ class _ModuleInitCodeBuilder(object): SymbolExposedTwiceError: Raised when an import with the same dest_name has already been added to dest_module_name. """ - import_str = self.format_import(source_module_name, source_name, dest_name) + import_str = format_import(source_module_name, source_name, dest_name) # Check if we are trying to expose two different symbols with same name. full_api_name = dest_name @@ -186,7 +211,7 @@ class _ModuleInitCodeBuilder(object): submodule = module_split[submodule_index-1] parent_module += '.' + submodule if parent_module else submodule import_from = self._output_package - if self._lazy_loading: + if _LAZY_LOADING: import_from += '.' + '.'.join(module_split[:submodule_index + 1]) self.add_import( symbol=None, @@ -222,7 +247,7 @@ class _ModuleInitCodeBuilder(object): get_canonical_import(imports) for _, imports in dest_name_to_imports.items() ] - if self._lazy_loading: + if _LAZY_LOADING: module_text_map[ dest_module] = _LAZY_LOADING_MODULE_TEXT_TEMPLATE % '\n'.join( sorted(imports_list)) @@ -233,7 +258,7 @@ class _ModuleInitCodeBuilder(object): # from it using * import. Don't need this for lazy_loading because the # underscore symbols are already included in __all__ when passed in and # handled by TFModuleWrapper. - if not self._lazy_loading: + if not _LAZY_LOADING: underscore_names_str = ', '.join( '\'%s\'' % name for name in self._underscore_names_in_root) @@ -250,10 +275,9 @@ __all__.extend([_s for _s in _names_with_underscore]) if not dest_module.startswith(_COMPAT_MODULE_PREFIX): deprecation = 'True' # Workaround to make sure not load lite from lite/__init__.py - if (not dest_module and 'lite' in self._module_imports - and self._lazy_loading): + if not dest_module and 'lite' in self._module_imports and _LAZY_LOADING: has_lite = 'True' - if self._lazy_loading: + if _LAZY_LOADING: public_apis_name = '_PUBLIC_APIS' else: public_apis_name = 'None' @@ -262,33 +286,6 @@ __all__.extend([_s for _s in _names_with_underscore]) return module_text_map, footer_text_map - def format_import(self, source_module_name, source_name, dest_name): - """Formats import statement. - - Args: - source_module_name: (string) Source module to import from. - source_name: (string) Source symbol name to import. - dest_name: (string) Destination alias name. - - Returns: - An import statement string. - """ - if self._lazy_loading: - return " '%s': ('%s', '%s')," % (dest_name, source_module_name, - source_name) - else: - if source_module_name: - if source_name == dest_name: - return 'from %s import %s' % (source_module_name, source_name) - else: - return 'from %s import %s as %s' % (source_module_name, source_name, - dest_name) - else: - if source_name == dest_name: - return 'import %s' % source_name - else: - return 'import %s as %s' % (source_name, dest_name) - def _get_name_and_module(full_name): """Split full_name into module and short name. @@ -371,8 +368,7 @@ def get_api_init_text(packages, output_package, api_name, api_version, - compat_api_versions=None, - lazy_loading=_LAZY_LOADING): + compat_api_versions=None): """Get a map from destination module to __init__.py code for that module. 
Args: @@ -384,8 +380,6 @@ def get_api_init_text(packages, api_version: API version you want to generate (1 or 2). compat_api_versions: Additional API versions to generate under compat/ directory. - lazy_loading: Boolean flag. If True, a lazy loading `__init__.py` file is - produced and if `False`, static imports are used. Returns: A dictionary where @@ -395,8 +389,7 @@ def get_api_init_text(packages, """ if compat_api_versions is None: compat_api_versions = [] - module_code_builder = _ModuleInitCodeBuilder( - output_package, api_version, lazy_loading) + module_code_builder = _ModuleInitCodeBuilder(output_package, api_version) # Traverse over everything imported above. Specifically, # we want to traverse over TensorFlow Python modules. @@ -498,8 +491,7 @@ def get_module_docstring(module_name, package, api_name): def create_api_files(output_files, packages, root_init_template, output_dir, output_package, api_name, api_version, - compat_api_versions, compat_init_templates, - lazy_loading=_LAZY_LOADING): + compat_api_versions, compat_init_templates): """Creates __init__.py files for the Python API. Args: @@ -517,8 +509,6 @@ def create_api_files(output_files, packages, root_init_template, output_dir, subdirectory. compat_init_templates: List of templates for top level compat init files in the same order as compat_api_versions. - lazy_loading: Boolean flag. If True, a lazy loading `__init__.py` file is - produced and if `False`, static imports are used. Raises: ValueError: if output_files list is missing a required file. @@ -536,7 +526,7 @@ def create_api_files(output_files, packages, root_init_template, output_dir, module_text_map, deprecation_footer_map = get_api_init_text( packages, output_package, api_name, - api_version, compat_api_versions, lazy_loading) + api_version, compat_api_versions) # Add imports to output files. missing_output_files = [] @@ -631,14 +621,6 @@ def main(): parser.add_argument( '--output_package', default='tensorflow', type=str, help='Root output package.') - parser.add_argument( - '--loading', default='default', type=str, - choices=['lazy', 'static', 'default'], - help='Controls how the generated __init__.py file loads the exported ' - 'symbols. \'lazy\' means the symbols are loaded when first used. ' - '\'static\' means all exported symbols are loaded in the ' - '__init__.py file. \'default\' uses the value of the ' - '_LAZY_LOADING constant in create_python_api.py.') args = parser.parse_args() if len(args.outputs) == 1: @@ -653,23 +635,9 @@ def main(): packages = args.packages.split(',') for package in packages: importlib.import_module(package) - - # Determine if the modules shall be loaded lazily or statically. - if args.loading == 'default': - lazy_loading = _LAZY_LOADING - elif args.loading == 'lazy': - lazy_loading = True - elif args.loading == 'static': - lazy_loading = False - else: - # This should never happen (tm). - raise ValueError('Invalid value for --loading flag: %s. Must be one of ' - 'lazy, static, default.' % args.loading) - create_api_files(outputs, packages, args.root_init_template, args.apidir, args.output_package, args.apiname, args.apiversion, - args.compat_apiversions, args.compat_init_templates, - lazy_loading) + args.compat_apiversions, args.compat_init_templates) if __name__ == '__main__': From e4e1a4f18550f99040fe63a58917105a33bfb85f Mon Sep 17 00:00:00 2001 From: Edward Loper Date: Wed, 24 Jul 2019 15:30:19 -0700 Subject: [PATCH 0514/3053] tf.ragged.stack_dynamic_partitions: Stacks dynamic partitions of a Tensor or RaggedTensor. 
E.g.: >>> data = ['a', 'b', 'c', 'd', 'e'] >>> partitions = [ 3, 0, 2, 2, 3] >>> num_partitions = 5 >>> tf.ragged.stack_dynamic_partitions(data, partitions, num_partitions) PiperOrigin-RevId: 259829821 --- tensorflow/python/ops/ragged/BUILD | 18 ++ .../python/ops/ragged/ragged_array_ops.py | 108 ++++++++ .../ragged_dynamic_partition_op_test.py | 257 ++++++++++++++++++ .../api/golden/v1/tensorflow.ragged.pbtxt | 4 + .../api/golden/v2/tensorflow.ragged.pbtxt | 4 + 5 files changed, 391 insertions(+) create mode 100644 tensorflow/python/ops/ragged/ragged_dynamic_partition_op_test.py diff --git a/tensorflow/python/ops/ragged/BUILD b/tensorflow/python/ops/ragged/BUILD index 2e0b6884b64..f1a802b8c7d 100644 --- a/tensorflow/python/ops/ragged/BUILD +++ b/tensorflow/python/ops/ragged/BUILD @@ -62,6 +62,7 @@ py_library( "//tensorflow/python:dtypes", "//tensorflow/python:framework_ops", "//tensorflow/python:math_ops", + "//tensorflow/python:sort_ops", "//tensorflow/python:tensor_util", "//tensorflow/python:util", ], @@ -1052,3 +1053,20 @@ py_test( "@absl_py//absl/testing:parameterized", ], ) + +py_test( + name = "ragged_dynamic_partition_op_test", + srcs = ["ragged_dynamic_partition_op_test.py"], + python_version = "PY3", + srcs_version = "PY2AND3", + deps = [ + ":ragged_array_ops", + ":ragged_factory_ops", + "//tensorflow/python:array_ops", + "//tensorflow/python:constant_op", + "//tensorflow/python:errors", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:platform_test", + "@absl_py//absl/testing:parameterized", + ], +) diff --git a/tensorflow/python/ops/ragged/ragged_array_ops.py b/tensorflow/python/ops/ragged/ragged_array_ops.py index 7714217fe50..e41e605b847 100644 --- a/tensorflow/python/ops/ragged/ragged_array_ops.py +++ b/tensorflow/python/ops/ragged/ragged_array_ops.py @@ -22,7 +22,9 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops +from tensorflow.python.ops import check_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import sort_ops from tensorflow.python.ops.ragged import ragged_functional_ops from tensorflow.python.ops.ragged import ragged_math_ops from tensorflow.python.ops.ragged import ragged_tensor @@ -520,3 +522,109 @@ def rank(input, name=None): # pylint: disable=redefined-builtin return array_ops.rank(input, name) return input.ragged_rank + array_ops.rank(input.flat_values) + + +#=============================================================================== +# ragged.stack_dynamic_partitions +#=============================================================================== +@tf_export('ragged.stack_dynamic_partitions') +def stack_dynamic_partitions(data, partitions, num_partitions, name=None): + """Stacks dynamic partitions of a Tensor or RaggedTensor. + + Returns a RaggedTensor `output` with `num_partitions` rows, where the row + `output[i]` is formed by stacking all slices `data[j1...jN]` such that + `partitions[j1...jN] = i`. Slices of `data` are stacked in row-major + order. + + If `num_partitions` is an `int` (not a `Tensor`), then this is equivalent to + `tf.ragged.stack(tf.dynamic_partition(data, partitions, num_partitions))`. 
+ + ####Example: + ```python + >>> data = ['a', 'b', 'c', 'd', 'e'] + >>> partitions = [ 3, 0, 2, 2, 3] + >>> num_partitions = 5 + >>> tf.ragged.stack_dynamic_partitions(data, partitions, num_partitions) + + ``` + + Args: + data: A `Tensor` or `RaggedTensor` containing the values to stack. + partitions: An `int32` or `int64` `Tensor` or `RaggedTensor` specifying the + partition that each slice of `data` should be added to. + `partitions.shape` must be a prefix of `data.shape`. Values must be + greater than or equal to zero, and less than `num_partitions`. + `partitions` is not required to be sorted. + num_partitions: An `int32` or `int64` scalar specifying the number of + partitions to output. This determines the number of rows in `output`. + name: A name prefix for the returned tensor (optional). + + Returns: + A `RaggedTensor` containing the stacked partitions. The returned tensor + has the same dtype as `data`, and its shape is + `[num_partitions, (D)] + data.shape[partitions.rank:]`, where `(D)` is a + ragged dimension whose length is the number of data slices stacked for + each `partition`. + """ + with ops.name_scope(name, 'SegmentStack', [data, partitions, num_partitions]): + # Convert inputs to tensors. + data = ragged_tensor.convert_to_tensor_or_ragged_tensor(data, name='data') + row_splits_dtype = ( + data.row_splits.dtype + if isinstance(data, ragged_tensor.RaggedTensor) else None) + partitions = ragged_tensor.convert_to_tensor_or_ragged_tensor( + partitions, name='partitions', preferred_dtype=row_splits_dtype) + num_partitions = ops.convert_to_tensor( + num_partitions, name='num_partitions', preferred_dtype=partitions.dtype) + if row_splits_dtype is not None: + partitions = math_ops.cast(partitions, row_splits_dtype) + num_partitions = math_ops.cast(num_partitions, partitions.dtype) + + # Sanity-checks for shapes. + partitions_rank = partitions.shape.ndims + if partitions_rank is None: + raise ValueError('partitions must have known rank.') + num_partitions.shape.assert_has_rank(0) + partitions.shape.assert_is_compatible_with(data.shape[:partitions_rank]) + + if partitions_rank == 0: + # If partitions is a scalar, then just create a RaggedTensor containing + # that single the complete `data` value in the specified row. + return ragged_tensor.RaggedTensor.from_value_rowids( + values=array_ops.stack([data]), + value_rowids=array_ops.stack([partitions]), + nrows=num_partitions, + validate=False) + + elif partitions_rank == 1: + # If partitions is a vector (the typical case): we can just use data and + # partitions as the `values` and `value_rowids` for `from_value_rowids`, + # as long as we sort them first. + permutation = sort_ops.argsort(partitions, stable=True) + value_rowids = array_ops.gather(partitions, permutation) + values = array_ops.gather(data, permutation) + check = check_ops.assert_less( + value_rowids[-1:], + num_partitions, + message='partitions must be less than num_partitions') + with ops.control_dependencies([check]): + return ragged_tensor.RaggedTensor.from_value_rowids( + values, value_rowids, nrows=num_partitions, validate=False) + + else: + # Handle higher-dimensional partitions via recursion. 
+ if not isinstance(data, ragged_tensor.RaggedTensor): + data = ragged_tensor.RaggedTensor.from_tensor( + data, row_splits_dtype=partitions.dtype, ragged_rank=1) + if not isinstance(partitions, ragged_tensor.RaggedTensor): + partitions = ragged_tensor.RaggedTensor.from_tensor( + partitions, + row_splits_dtype=partitions.dtype, + ragged_rank=max(data.ragged_rank, partitions_rank - 1)) + check = check_ops.assert_equal( + data.row_splits, + partitions.row_splits, + message='data and partitions have incompatible ragged shapes') + with ops.control_dependencies([check]): + return stack_dynamic_partitions(data.values, partitions.values, + num_partitions) diff --git a/tensorflow/python/ops/ragged/ragged_dynamic_partition_op_test.py b/tensorflow/python/ops/ragged/ragged_dynamic_partition_op_test.py new file mode 100644 index 00000000000..790cabdaf6f --- /dev/null +++ b/tensorflow/python/ops/ragged/ragged_dynamic_partition_op_test.py @@ -0,0 +1,257 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for ragged_array_ops.stack_dynamic_partitions.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl.testing import parameterized + +from tensorflow.python.eager import context +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors +from tensorflow.python.framework import test_util +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import data_flow_ops +from tensorflow.python.ops.ragged import ragged_array_ops +from tensorflow.python.ops.ragged import ragged_concat_ops +from tensorflow.python.ops.ragged import ragged_factory_ops +from tensorflow.python.platform import googletest + + +@test_util.run_all_in_graph_and_eager_modes +class RaggedSegmentStackOpTest(test_util.TensorFlowTestCase, + parameterized.TestCase): + + @parameterized.parameters([ + dict( # empty inputs + data=[], + partitions=[], + num_partitions=0, + expected=[], + expected_ragged_rank=1), + dict( # empty data, num_partitions>0 + data=[], + partitions=[], + num_partitions=3, + expected=[[], [], []]), + dict( # 1D data, 1D partitions (docstring example) + data=['a', 'b', 'c', 'd', 'e'], + partitions=[3, 0, 2, 2, 3], + num_partitions=5, + expected=[['b'], [], ['c', 'd'], ['a', 'e'], []]), + dict( # 2D data, 1D partitions + data=[['a', 'b'], ['c', 'd'], ['e', 'f'], ['g', 'h']], + data_ragged_rank=0, + partitions=[2, 1, 2, 3], + num_partitions=4, + expected=[[], [['c', 'd']], [['a', 'b'], ['e', 'f']], [['g', 'h']]], + expected_ragged_rank=1), + dict( # 2D ragged data, 1D partitions + data=[['a'], ['b', 'c', 'd'], [], ['e', 'f']], + data_ragged_rank=1, + partitions=[2, 1, 2, 3], + num_partitions=4, + expected=[[], [['b', 'c', 'd']], [['a'], []], [['e', 'f']]], + expected_ragged_rank=2), + dict( # 2D data, 2D partitions + data=[['a', 'b'], ['c', 'd'], ['e', 
'f'], ['g', 'h']], + data_ragged_rank=0, + partitions=[[3, 0], [2, 2], [4, 3], [2, 0]], + num_partitions=5, + expected=[['b', 'h'], [], ['c', 'd', 'g'], ['a', 'f'], ['e']]), + dict( # 2D ragged data, 2D ragged partitions + data=[['a', 'b'], ['c', 'd'], ['e', 'f'], ['g', 'h']], + data_ragged_rank=0, + partitions=[[3, 0], [2, 2], [4, 3], [2, 0]], + num_partitions=5, + expected=[['b', 'h'], [], ['c', 'd', 'g'], ['a', 'f'], ['e']]), + dict( # 3D data, 1d partitions + data=[[['a', 'b'], ['c', 'd']], [['e', 'f'], ['g', 'h']]], + data_ragged_rank=0, + partitions=[1, 0], + num_partitions=2, + expected=[[[['e', 'f'], ['g', 'h']]], [[['a', 'b'], ['c', 'd']]]], + expected_ragged_rank=1), + dict( # 3D data (ragged_rank=1), 1d partitions + data=[[['a', 'b'], ['c', 'd']], [['e', 'f']]], + data_ragged_rank=1, + partitions=[2, 0], + num_partitions=3, + expected=[[[['e', 'f']]], [], [[['a', 'b'], ['c', 'd']]]], + expected_ragged_rank=2), + dict( # 3D data (ragged_rank=2), 1d partitions + data=[[['a', 'b'], ['c', 'd']], [['e', 'f', 'g', 'h']]], + data_ragged_rank=2, + partitions=[2, 0], + num_partitions=3, + expected=[[[['e', 'f', 'g', 'h']]], [], [[['a', 'b'], ['c', 'd']]]], + expected_ragged_rank=3), + dict( # 3D data, 2d partitions + data=[[['a', 'b'], ['c', 'd']], [['e', 'f'], ['g', 'h']]], + data_ragged_rank=0, + partitions=[[1, 0], [0, 3]], + segment_ids_ragged_rank=0, + num_partitions=4, + expected=[[['c', 'd'], ['e', 'f']], [['a', 'b']], [], [['g', 'h']]], + expected_ragged_rank=1), + dict( # 3D data (ragged_rank=1), 2d partitions + data=[[['a', 'b'], ['c', 'd']], [['e', 'f']]], + data_ragged_rank=1, + partitions=[[1, 0], [0]], + segment_ids_ragged_rank=1, + num_partitions=2, + expected=[[['c', 'd'], ['e', 'f']], [['a', 'b']]], + expected_ragged_rank=1), + dict( # 3D data (ragged_rank=2), 2d partitions + data=[[['a', 'b'], ['c', 'd']], [['e', 'f', 'g', 'h']]], + data_ragged_rank=2, + partitions=[[1, 0], [0]], + segment_ids_ragged_rank=1, + num_partitions=3, + expected=[[['c', 'd'], ['e', 'f', 'g', 'h']], [['a', 'b']], []], + expected_ragged_rank=2), + dict( # 3D data (ragged_rank=2), 3d partitions (ragged_rank=2) + data=[[['a', 'b'], ['c', 'd']], [['e', 'f', 'g', 'h']]], + data_ragged_rank=2, + partitions=[[[3, 0], [1, 2]], [[1, 1, 0, 1]]], + segment_ids_ragged_rank=2, + num_partitions=4, + expected=[['b', 'g'], ['c', 'e', 'f', 'h'], ['d'], ['a']]), + dict( # 0D data, 0D partitions + data='a', + partitions=3, + num_partitions=5, + expected=[[], [], [], ['a'], []]), + dict( # 1D data, 0D partitions + data=['a', 'b', 'c'], + partitions=3, + num_partitions=5, + expected=[[], [], [], [['a', 'b', 'c']], []], + expected_ragged_rank=1), + dict( # 2D data, 0D partitions + data=[['a', 'b'], ['c', 'd']], + data_ragged_rank=0, + partitions=3, + num_partitions=5, + expected=[[], [], [], [[['a', 'b'], ['c', 'd']]], []], + expected_ragged_rank=1), + dict( # 2D data (ragged_rank=1), 0D partitions + data=[['a', 'b'], ['c']], + data_ragged_rank=1, + partitions=3, + num_partitions=5, + expected=[[], [], [], [[['a', 'b'], ['c']]], []], + expected_ragged_rank=3), + ]) + def testRaggedSegmentStack(self, + data, + partitions, + num_partitions, + expected, + data_ragged_rank=None, + segment_ids_ragged_rank=None, + expected_ragged_rank=None): + for seg_dtype in [dtypes.int32, dtypes.int64]: + data_tensor = ragged_factory_ops.constant( + data, row_splits_dtype=seg_dtype, ragged_rank=data_ragged_rank) + segment_ids_tensor = ragged_factory_ops.constant( + partitions, + dtype=seg_dtype, + row_splits_dtype=seg_dtype, + 
ragged_rank=segment_ids_ragged_rank) + expected_tensor = ragged_factory_ops.constant( + expected, + row_splits_dtype=seg_dtype, + ragged_rank=expected_ragged_rank) + result = ragged_array_ops.stack_dynamic_partitions( + data_tensor, segment_ids_tensor, num_partitions) + self.assertAllEqual(result, expected_tensor) + + # Check that it's equivalent to tf.stack(dynamic_partition(...)), + # where applicable. + if (data_ragged_rank == 0 and segment_ids_ragged_rank == 0 and + seg_dtype == dtypes.int32): + equiv = ragged_concat_ops.stack( + data_flow_ops.dynamic_partition(data_tensor, segment_ids_tensor, + num_partitions)) + self.assertAllEqual(result, self.evaluate(equiv).to_list()) + + @parameterized.parameters([ + dict( + data=['a', 'b', 'c'], + partitions=[2, -1, 0], + num_partitions=10, + error='must be non-negative'), + dict( + data=['a', 'b', 'c'], + partitions=[2, 10, 0], + num_partitions=1, + error='partitions must be less than num_partitions'), + dict( + data=['a', 'b', 'c'], + partitions=[2, 10, 0], + num_partitions=10, + error='partitions must be less than num_partitions'), + dict( + data=[['a', 'b'], ['c']], + partitions=[[2], [3, 0]], + num_partitions=10, + error='data and partitions have incompatible ragged shapes'), + ]) + def testRuntimeError(self, data, partitions, num_partitions, error): + data = ragged_factory_ops.constant(data) + partitions = ragged_factory_ops.constant(partitions, dtype=dtypes.int64) + with self.assertRaisesRegexp((ValueError, errors.InvalidArgumentError), + error): + self.evaluate( + ragged_array_ops.stack_dynamic_partitions(data, partitions, + num_partitions)) + + @parameterized.parameters([ + dict( + data=['a', 'b', 'c'], + partitions=[1, 2], + num_partitions=10, + error=r'Shapes \(2,\) and \(3,\) are incompatible'), + dict( + data=[['a', 'b'], ['c', 'd']], + partitions=[[1, 2, 3], [4, 5, 6]], + num_partitions=10, + error=r'Shapes \(2, 3\) and \(2, 2\) are incompatible'), + dict( + data=['a', 'b', 'c'], + partitions=[1, 2, 3], + num_partitions=[1, 2, 3], + error='must have rank 0'), + ]) + def testStaticError(self, data, partitions, num_partitions, error): + with self.assertRaisesRegexp((ValueError, errors.InvalidArgumentError), + error): + ragged_array_ops.stack_dynamic_partitions(data, partitions, + num_partitions) + + def testUnknownRankError(self): + if context.executing_eagerly(): + return + partitions = array_ops.placeholder(dtypes.int32, None) + with self.assertRaisesRegexp((ValueError, errors.InvalidArgumentError), + 'partitions must have known rank'): + ragged_array_ops.stack_dynamic_partitions(['a', 'b', 'c'], partitions, 10) + + +if __name__ == '__main__': + googletest.main() diff --git a/tensorflow/tools/api/golden/v1/tensorflow.ragged.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.ragged.pbtxt index 55ad2621d80..c37b5118dbd 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.ragged.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.ragged.pbtxt @@ -40,4 +40,8 @@ tf_module { name: "stack" argspec: "args=[\'values\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], " } + member_method { + name: "stack_dynamic_partitions" + argspec: "args=[\'data\', \'partitions\', \'num_partitions\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } } diff --git a/tensorflow/tools/api/golden/v2/tensorflow.ragged.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.ragged.pbtxt index 2420aa902e0..75144f1cf97 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.ragged.pbtxt +++ 
b/tensorflow/tools/api/golden/v2/tensorflow.ragged.pbtxt @@ -28,4 +28,8 @@ tf_module { name: "stack" argspec: "args=[\'values\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], " } + member_method { + name: "stack_dynamic_partitions" + argspec: "args=[\'data\', \'partitions\', \'num_partitions\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " + } } From dead6246b875522fffe06d16ae218c900982bf33 Mon Sep 17 00:00:00 2001 From: Ashwin Murthy Date: Wed, 24 Jul 2019 15:30:27 -0700 Subject: [PATCH 0515/3053] [TFLite] Export stateful tensors in TFLite flatbuffer using the StatefulOperand OpTrait. These are currently used by the LSTM ops in TFLite. PiperOrigin-RevId: 259829846 --- .../mlir/lite/flatbuffer_translate.cc | 33 +- .../mlir/lite/tests/mlir2flatbuffer/lstm.mlir | 284 ++++++++++++++++++ 2 files changed, 315 insertions(+), 2 deletions(-) create mode 100644 tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm.mlir diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc b/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc index 5f460b45c16..1e01e5012ff 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc @@ -414,6 +414,10 @@ class Translator { // mapping. void InitializeNamesFromAttribute(FuncOp fn); + // Determines if the specified operation op's operand at operand_index + // is marked as a stateful operand. + bool IsStatefulOperand(mlir::Operation* op, int operand_index); + // Returns a unique name for `op`. std::string UniqueName(mlir::Operation* op); @@ -559,10 +563,19 @@ Optional> Translator::BuildTensor( } else { q_params = tflite::CreateQuantizationParameters(builder_); } - + // Check if the value's uses includes an op and usage at an operand index + // marked as a stateful. If so, set the tensor's is_variable as true + // This is v1 ref variable semantics in the TFLite runtime. + bool is_variable = false; + for (auto& use : value->getUses()) { + is_variable = IsStatefulOperand(use.getOwner(), use.getOperandNumber()); + if (is_variable) { + break; + } + } return tflite::CreateTensor( builder_, builder_.CreateVector(shape), tflite_element_type, buffer_idx, - builder_.CreateString(name), q_params, /*is_variable=*/false); + builder_.CreateString(name), q_params, /*is_variable=*/is_variable); } BufferOffset Translator::BuildIfOperator( @@ -859,6 +872,22 @@ void Translator::InitializeNamesFromAttribute(FuncOp fn) { } } +bool Translator::IsStatefulOperand(mlir::Operation* op, int operand_index) { + std::vector operand_indices; + // TODO(b/138254427): When the bug is addressed, we'll be able to inspect + // for the presence of a specific OpTrait using mlir::Operation, without + // having to cast it to specific ops like below. + // Until then, when a new RNN/LSTM op is added to TFLite and has stateful + // tensors as operands, they will need to be added here as well. 
+ if (auto tfl = llvm::dyn_cast(op)) { + operand_indices = tfl.GetStatefulOperands(); + } else if (auto tfl = + llvm::dyn_cast(op)) { + operand_indices = tfl.GetStatefulOperands(); + } + return absl::c_find(operand_indices, operand_index) != operand_indices.end(); +} + Optional> Translator::BuildSubGraph(FuncOp fn) { InitializeNamesFromAttribute(fn); std::vector> tensors; diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm.mlir new file mode 100644 index 00000000000..1bea2b01714 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/lstm.mlir @@ -0,0 +1,284 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s + +func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>) -> tensor<4 x f32> { +// CHECK: { +// CHECK-NEXT: version: 3, +// CHECK-NEXT: operator_codes: [ { +// CHECK-NEXT: builtin_code: LSTM +// CHECK-NEXT: } ], +// CHECK-NEXT: subgraphs: [ { +// CHECK-NEXT: tensors: [ { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 1, +// CHECK-NEXT: name: "tfl.pseudo_input", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 2, +// CHECK-NEXT: name: "tfl.pseudo_input1", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 3, +// CHECK-NEXT: name: "tfl.pseudo_input2", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 4, +// CHECK-NEXT: name: "tfl.pseudo_input3", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 5, +// CHECK-NEXT: name: "tfl.pseudo_input4", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 6, +// CHECK-NEXT: name: "tfl.pseudo_input5", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 7, +// CHECK-NEXT: name: "tfl.pseudo_input6", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 8, +// CHECK-NEXT: name: "tfl.pseudo_input7", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 9, +// CHECK-NEXT: name: "tfl.pseudo_input8", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 10, +// CHECK-NEXT: name: "tfl.pseudo_input9", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 11, +// CHECK-NEXT: name: "tfl.pseudo_input10", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 12, +// CHECK-NEXT: 
name: "tfl.pseudo_input11", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 13, +// CHECK-NEXT: name: "tfl.pseudo_input12", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 14, +// CHECK-NEXT: name: "tfl.pseudo_input13", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 15, +// CHECK-NEXT: name: "tfl.pseudo_input14", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 16, +// CHECK-NEXT: name: "tfl.pseudo_input15", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 17, +// CHECK-NEXT: name: "tfl.pseudo_input16", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 18, +// CHECK-NEXT: name: "tfl.pseudo_input17", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 19, +// CHECK-NEXT: name: "tfl.pseudo_input18", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: }, +// CHECK-NEXT: is_variable: true +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 20, +// CHECK-NEXT: name: "tfl.pseudo_input19", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: }, +// CHECK-NEXT: is_variable: true +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 21, +// CHECK-NEXT: name: "tfl.pseudo_input20", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 22, +// CHECK-NEXT: name: "tfl.pseudo_input21", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 23, +// CHECK-NEXT: name: "tfl.pseudo_input22", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 24, +// CHECK-NEXT: name: "tfl.pseudo_input23", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 25, +// CHECK-NEXT: name: "tfl.lstm", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: } ], +// CHECK-NEXT: inputs: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 ], +// CHECK-NEXT: outputs: [ 24 ], +// CHECK-NEXT: operators: [ { +// CHECK-NEXT: inputs: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 ], +// CHECK-NEXT: outputs: [ 24 ], +// CHECK-NEXT: builtin_options_type: LSTMOptions, +// CHECK-NEXT: builtin_options: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: } ], +// CHECK-NEXT: name: "main" +// CHECK-NEXT: } ], +// CHECK-NEXT: description: "MLIR Converted.", +// CHECK-NEXT: buffers: [ { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// 
CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: } ] +// CHECK-NEXT: } +// CHECK-EMPTY: + +^bb0(%arg0: tensor<4 x f32>, %arg1: tensor<4 x f32>, %arg2: tensor<4 x f32>, %arg3: tensor<4 x f32>, %arg4: tensor<4 x f32>, %arg5: tensor<4 x f32>, %arg6: tensor<4 x f32>, %arg7: tensor<4 x f32>, %arg8: tensor<4 x f32>, %arg9: tensor<4 x f32>, %arg10: tensor<4 x f32>, %arg11: tensor<4 x f32>, %arg12: tensor<4 x f32>, %arg13: tensor<4 x f32>, %arg14: tensor<4 x f32>, %arg15: tensor<4 x f32>, %arg16: tensor<4 x f32>, %arg17: tensor<4 x f32>, %arg18: tensor<4 x f32>, %arg19: tensor<4 x f32>, %arg20: tensor<4 x f32>, %arg21: tensor<4 x f32>, %arg22: tensor<4 x f32>, %arg23: tensor<4 x f32>): + %0 = "tfl.pseudo_input" (%arg0) : (tensor<4 x f32>) -> tensor<4 x f32> + %1 = "tfl.pseudo_input" (%arg1) : (tensor<4 x f32>) -> tensor<4 x f32> + %2 = "tfl.pseudo_input" (%arg2) : (tensor<4 x f32>) -> tensor<4 x f32> + %3 = "tfl.pseudo_input" (%arg3) : (tensor<4 x f32>) -> tensor<4 x f32> + %4 = "tfl.pseudo_input" (%arg4) : (tensor<4 x f32>) -> tensor<4 x f32> + %5 = "tfl.pseudo_input" (%arg5) : (tensor<4 x f32>) -> tensor<4 x f32> + %6 = "tfl.pseudo_input" (%arg6) : (tensor<4 x f32>) -> tensor<4 x f32> + %7 = "tfl.pseudo_input" (%arg7) : (tensor<4 x f32>) -> tensor<4 x f32> + %8 = "tfl.pseudo_input" (%arg8) : (tensor<4 x f32>) -> tensor<4 x f32> + %9 = "tfl.pseudo_input" (%arg9) : (tensor<4 x f32>) -> tensor<4 x f32> + %10 = "tfl.pseudo_input" (%arg10) : (tensor<4 x f32>) -> tensor<4 x f32> + %11 = "tfl.pseudo_input" (%arg11) : (tensor<4 x f32>) -> tensor<4 x f32> + %12 = "tfl.pseudo_input" (%arg12) : (tensor<4 x f32>) -> tensor<4 x f32> + %13 = "tfl.pseudo_input" (%arg13) : (tensor<4 x f32>) -> tensor<4 x f32> + %14 = "tfl.pseudo_input" (%arg14) : (tensor<4 x f32>) -> tensor<4 x f32> + %15 = "tfl.pseudo_input" (%arg15) : (tensor<4 x f32>) -> tensor<4 x f32> + %16 = "tfl.pseudo_input" (%arg16) : (tensor<4 x f32>) -> tensor<4 x f32> + %17 = "tfl.pseudo_input" (%arg17) : (tensor<4 x f32>) -> tensor<4 x f32> + %18 = "tfl.pseudo_input" (%arg18) : (tensor<4 x f32>) -> tensor<4 x f32> + %19 = "tfl.pseudo_input" (%arg19) : (tensor<4 x f32>) -> tensor<4 x f32> + %20 = "tfl.pseudo_input" (%arg20) : (tensor<4 x f32>) -> tensor<4 x f32> + %21 = "tfl.pseudo_input" (%arg21) : (tensor<4 x f32>) -> tensor<4 x f32> + %22 = "tfl.pseudo_input" (%arg22) : (tensor<4 x f32>) -> tensor<4 x f32> + %23 = "tfl.pseudo_input" (%arg23) : (tensor<4 x f32>) -> tensor<4 x f32> + %24 = "tfl.lstm"(%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23) {fused_activation_function = "NONE", kernel_type = "FULL"} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, 
tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + return %24 : tensor<4xf32> +} \ No newline at end of file From 630bd06d4aa20a0193dd51cb0a5635ba67d61140 Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Wed, 24 Jul 2019 15:40:10 -0700 Subject: [PATCH 0516/3053] Start a section on limitations. PiperOrigin-RevId: 259831791 --- .../python/autograph/g3doc/reference/index.md | 11 + .../python/autograph/g3doc/reference/intro.md | 9 - .../autograph/g3doc/reference/limitations.md | 272 ++++++++++++++++++ 3 files changed, 283 insertions(+), 9 deletions(-) create mode 100644 tensorflow/python/autograph/g3doc/reference/limitations.md diff --git a/tensorflow/python/autograph/g3doc/reference/index.md b/tensorflow/python/autograph/g3doc/reference/index.md index 1a1259643bf..28a9b37439f 100644 --- a/tensorflow/python/autograph/g3doc/reference/index.md +++ b/tensorflow/python/autograph/g3doc/reference/index.md @@ -2,6 +2,9 @@ This reference document describes the semantics of AutoGraph transformations. +In `@tf.function`, AutoGraph allows running Eager-style code as a TensorFlow +graph. + * [Introduction](intro.md) * [Interacting with the generated code](generated_code.md) * [Debugging AutoGraph code](debugging.md) @@ -10,3 +13,11 @@ This reference document describes the semantics of AutoGraph transformations. * Exceptions (coming soon) * Builtin Functions (coming soon) * Datasets (coming soon) +* [Limitations](limitations.md) + +For more information on AutoGraph, see the following articles: + +* [AutoGraph tutorial](https://www.tensorflow.org/alpha/guide/autograph) +* [Eager tutorial](https://www.tensorflow.org/alpha/guide/eager) +* [TensorFlow 2.0 Alpha](https://www.tensorflow.org/alpha) +* [AutoGraph blog post](https://medium.com/tensorflow/autograph-converts-python-into-tensorflow-graphs-b2a871f87ec7) diff --git a/tensorflow/python/autograph/g3doc/reference/intro.md b/tensorflow/python/autograph/g3doc/reference/intro.md index 1c720fd2e9f..1de00699590 100644 --- a/tensorflow/python/autograph/g3doc/reference/intro.md +++ b/tensorflow/python/autograph/g3doc/reference/intro.md @@ -4,15 +4,6 @@ ## Introduction -This document describes the semantics of AutoGraph's code transformations. - -For more information on AutoGraph, see the following articles: - -* [AutoGraph tutorial](https://www.tensorflow.org/alpha/guide/autograph) -* [Eager tutorial](https://www.tensorflow.org/alpha/guide/eager) -* [TensorFlow 2.0 Alpha](https://www.tensorflow.org/alpha) -* [AutoGraph blog post](https://medium.com/tensorflow/autograph-converts-python-into-tensorflow-graphs-b2a871f87ec7) - ### Terminology Typically, AutoGraph operates by converting a function into a new function with diff --git a/tensorflow/python/autograph/g3doc/reference/limitations.md b/tensorflow/python/autograph/g3doc/reference/limitations.md new file mode 100644 index 00000000000..dd15d50afcb --- /dev/null +++ b/tensorflow/python/autograph/g3doc/reference/limitations.md @@ -0,0 +1,272 @@ +# AutoGraph reference + +[Index](index.md) + +## Limitations + +When AutoGraph is applied to normal Python code, you should expect no change +in functionality. +However, when applied to TensorFlow control flow (for example, an if statement +with a `tf.Tensor` condition), there are certain limitations. This section +describes these limitations and practices that will allow you to avoid them. 
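+
+For example, the following sketch (a minimal illustration assuming TensorFlow 2.x;
+`f` is just an illustrative name, not part of this reference) shows an `if`
+statement that AutoGraph stages as TensorFlow control flow because its condition
+is a `tf.Tensor`:
+
+```
+import tensorflow as tf
+
+@tf.function
+def f(x):
+  if x > 0:  # `x > 0` is a `tf.Tensor`, so this `if` becomes TF control flow
+    x = x + 1
+  return x
+
+f(tf.constant(1))  # returns a Tensor holding 2
+```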
+
+### Indirect modifications and hidden side effects in TensorFlow control flow
+
+Key Point: We recommend using functional style and immutable Python collections.
+
+#### AutoGraph analyzes code to detect modifications
+
+One of the most important functions of AutoGraph is to rewrite Python control
+flow statements into equivalent TensorFlow ops. This process requires "wiring"
+the variables in the Python code whose values are affected by these control
+flow statements into the respective ops.
+
+Note: Python variables should not be confused with TensorFlow variables.
+
+The examples below use a `while` loop, but the same notions extend to all
+control flow: `if` and `for` statements.
+
+In the example below, `x` needs to become a _loop variable_ of the
+corresponding `tf.while_loop`:
+
+```
+while x > 0:
+  x = x - 1
+```
+```
+x = tf.while_loop(..., loop_vars=(x,))
+```
+
+TF control ops support only a limited set of types for loop variables. At the
+same time, the efficiency of TensorFlow graphs is influenced by the number of
+loop variables, so we don't want to create them unnecessarily. For this reason,
+AutoGraph only pulls symbols through loop variables if necessary.
+
+Note: If a symbol refers to a nested structure, such as a `dict` of `dict`s,
+then when that symbol is added to the loop variables the entire structure
+becomes part of the loop variables - TensorFlow automatically unpacks it.
+
+For example, the symbol `y` below is not wired through the `tf.while_loop`'s
+`loop_vars` because it is not affected by the while loop:
+
+```
+y = 0
+while x > 0:
+  x = x - 1
+print(y)
+```
+```
+x = tf.while_loop(..., loop_vars=(x,))  # y does not need to be a loop variable
+```
+
+AutoGraph uses static analysis to determine which symbols are modified by the
+code, in order to transform them into control flow variables. Static analysis
+is generally performed on single functions - Python's dynamic nature limits its
+effectiveness across functions.
+
+#### Modifications are not detected across functions
+
+Because static analysis is limited to single functions, modifications that are
+performed in other functions are not visible to AutoGraph:
+
+```
+def change_y():
+  global y
+  y = y + 1
+
+while x > 0:
+  change_y()  # Problem -- change made to y is not visible here!
+```
+
+This can be easily remedied using functional style - writing functions that
+take their inputs as arguments and return everything they calculate as return
+values:
+
+```
+def change(y):
+  y = y + 1
+  return y
+
+while x > 0:
+  y = change(y)  # Okay -- y can now be properly tracked!
+```
+
+#### Modifications are not detected in methods
+
+A special case of hidden side effects is methods, which are commonly used
+to change the value of objects:
+
+```
+class MyClass(object):
+  def change(self):
+    self.y += 1
+
+c = MyClass()
+while x > 0:
+  c.change()  # Problem -- modification to c.y is not visible here!
+```
+
+This can be addressed in a number of ways.
+
+One possibility is to operate directly on the object properties:
+
+```
+c = MyClass()
+while x > 0:
+  c.y += 1  # Okay -- c.y can now be properly tracked!
+```
+
+Another possibility is to rely on immutable objects. This may lead to many
+temporary objects when executing eagerly, but their number is greatly reduced
+in `@tf.function`:
+
+```
+class MyClass(object):
+  def change(self):
+    self.y += 1
+    return self
+
+c = MyClass()
+while x > 0:
+  c = c.change()  # Okay -- c is now a loop var.
+```
+
+Note: TensorFlow control flow does not currently support arbitrary Python
+objects, but it does support basic collection objects such as `list`, `dict`,
+`tuple`, `namedtuple` and their subclasses. Design your objects as subclasses
+of [namedtuple](https://docs.python.org/3/library/collections.html#collections.namedtuple).
+
+### Python collections in TensorFlow control flow
+
+Key Point: Use TensorFlow collection classes instead of Python collections.
+Python collections are okay to use when they represent a fixed structure (that
+is, `list`s don't change length, `dict`s don't add or remove keys).
+
+#### Modifying Python collections in TensorFlow control flow is not allowed
+
+One of the advantages of eager execution is that you may use the usual Python
+collections, like `list` or `dict`, to hold `tf.Tensor` values. However, these
+are generally not compatible with TensorFlow control flow. Specialized
+collections like `tf.TensorArray` are required.
+
+Consider the following example:
+
+```
+def fn():
+  l = []
+
+  def loop_cond(i):
+    return i < 10
+
+  def loop_body(i):
+    i = i + 1
+    l.append(i)
+    return i,
+
+  tf.while_loop(
+      cond=loop_cond,
+      body=loop_body,
+      loop_vars=(0,))
+
+  return l
+```
+
+This code works in eager execution, which does not use the TensorFlow runtime
+for the `tf.while_loop`:
+
+```
+fn()
+```
+
+However, it does not work in graph execution, because TensorFlow uses special
+mechanisms to ensure the computations are correctly sequenced in the dataflow
+graph:
+
+```
+tf.function(fn)()  # Error -- illegal tensor capture!
+```
+
+The equivalent AutoGraph code raises the same error:
+
+```
+l = []
+for i in tf.range(10):
+  l.append(i)  # Error -- illegal tensor capture!
+```
+
+Instead, use the specialized `tf.TensorArray` class:
+
+```
+l = tf.TensorArray(tf.int32, size=0, dynamic_size=True)
+for i in tf.range(10):
+  l = l.write(l.size(), i)  # Okay
+```
+
+#### Python collections of fixed structure are allowed in TensorFlow control flow
+
+An exception to the previous rule is made for Python collections that are
+static, that is, they don't grow in size for the duration of the computation.
+
+Caution: Use functional style when manipulating static collections.
+
+Examples:
+
+```
+static_list = [tf.constant(3)]
+while static_list[0] > 0:
+  static_list[0] -= 1  # Okay -- static_list does not change structure
+```
+```
+static_object = MyClass()
+static_object.field = tf.constant(3)
+while static_object.field > 0:
+  static_object.field -= 1  # Okay -- static_object does not change structure
+```
+```
+static_dict = {'field': tf.constant(3)}
+while static_dict['field'] > 0:
+  static_dict['field'] -= 1  # Okay -- static_dict does not change structure
+```
+
+However, remember to use functional style when these collections are used
+inside control flow.
+
+#### Python collections of fixed structure with dynamic index
+
+A more subtle error occurs when the collection is static, but is accessed in a
+dynamic way, that is, with a key that is not constant.
+
+For example:
+
+```
+d = {'a': tf.constant(3)}
+for i in tf.range(10):
+  for key in d:
+    d[key] += i  # Problem -- accessing `dict` using non-constant key
+```
+
+The code above raises an "illegal capture" error. To remedy it, write it
+in functional style:
+
+```
+d = {'a': tf.constant(3)}
+for i in tf.range(10):
+  d = {key: value + i for key, value in d.items()}  # Okay
+```
+
+### Access to source code
+
+Key Point: AutoGraph can only handle functions whose source code can be
+accessed at runtime.
+ +Almost all Python functions allow access to their source code. However, a few +exceptions exist: + + * functions created in the Python interactive shell + * functions with native bindings (these do not have Python source code) + * functions created dynamically, using `exec` or `eval` + +Use +[inspect.getsource](https://docs.python.org/3/library/inspect.html#inspect.getsource) +to quickly diagnose whether the source code is available for a function. From 27581f164e983c0fac268d60511490e7696ffe4f Mon Sep 17 00:00:00 2001 From: Rick Chao Date: Wed, 24 Jul 2019 15:43:41 -0700 Subject: [PATCH 0517/3053] Fix tensorflow/python/keras:wrappers_test for new Keras single code path. PiperOrigin-RevId: 259832414 --- tensorflow/python/keras/layers/wrappers_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/keras/layers/wrappers_test.py b/tensorflow/python/keras/layers/wrappers_test.py index 8fe13f4546f..182b729e09c 100644 --- a/tensorflow/python/keras/layers/wrappers_test.py +++ b/tensorflow/python/keras/layers/wrappers_test.py @@ -237,7 +237,8 @@ class TimeDistributedTest(test.TestCase): mask_value=0.,), input_shape=(None, 4))) model.add(keras.layers.TimeDistributed(keras.layers.Dense(5))) model.compile(optimizer='rmsprop', loss='mse') - model_input = np.random.randint(low=1, high=5, size=(10, 3, 4)) + model_input = np.random.randint( + low=1, high=5, size=(10, 3, 4)).astype(np.float32) for i in range(4): model_input[i, i:, :] = 0. model.compile(optimizer='rmsprop', loss='mse') From 0ff6576ad8640888e7b31d55701e26ad2f08fc4e Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Wed, 24 Jul 2019 15:48:17 -0700 Subject: [PATCH 0518/3053] Update control_to_executor_dialect test to use captured targets. PiperOrigin-RevId: 259833315 --- .../mlir/tensorflow/tests/control_to_executor_dialect.mlir | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/control_to_executor_dialect.mlir b/tensorflow/compiler/mlir/tensorflow/tests/control_to_executor_dialect.mlir index b1a9dd71fc7..48f4c8f77df 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/control_to_executor_dialect.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/control_to_executor_dialect.mlir @@ -79,7 +79,7 @@ func @LoopTest() { // CHECK-NEXT: %{{[0-9]*}} = "tf.Add"(%[[IDENTITY]]#0, %[[CONST_ADD]]#0) {T = "tfdtype$DT_INT32", device = "", name = "while/Add"} : (tensor<*xi32>, tensor) -> tensor<*xi32> // CHECK-NEXT: tf_executor.yield %{{[0-9]*}} : tensor<*xi32> // CHECK-NEXT: } -// CHECK-NEXT: %[[CT:[0-9]*]] = tf_executor.ControlTrigger %2, %12#1, %9#1 {_tpu_replicate = "cluster", device = "", name = "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/b_sync"} +// CHECK-NEXT: %[[CT:[0-9]*]] = tf_executor.ControlTrigger %[[NOOP]], %[[ADD]]#1, %[[EXIT]]#1 {_tpu_replicate = "cluster", device = "", name = "gradients/while/mul_2_Da30D05wlPU_grad/SymbolicGradient/b_sync"} // CHECK-NEXT: tf_executor.NextIteration.Sink [%[[NEXTIT_SRC]]#1] %[[ADD]]#0, %[[CT]] : tensor<*xi32> {T = "tfdtype$DT_INT32", device = "", id = 0 : i64, name = "while/NextIteration"} // CHECK-NEXT: tf_executor.fetch // CHECK-NEXT: } From 9c7ffad45cfc56137ab43ff355ef31d6764a3674 Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 24 Jul 2019 16:01:23 -0700 Subject: [PATCH 0519/3053] [Grappler] Cancel multiple Transpose nodes around Pad in one shot PiperOrigin-RevId: 259835724 --- .../optimizers/generic_layout_optimizer.cc | 47 +++++++++++++++++-- .../generic_layout_optimizer_test.cc | 8 +++- 
2 files changed, 49 insertions(+), 6 deletions(-) diff --git a/tensorflow/core/grappler/optimizers/generic_layout_optimizer.cc b/tensorflow/core/grappler/optimizers/generic_layout_optimizer.cc index 38393e14a5c..a33d1888198 100644 --- a/tensorflow/core/grappler/optimizers/generic_layout_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/generic_layout_optimizer.cc @@ -40,6 +40,12 @@ constexpr char kNCHW[] = "NCHW"; constexpr float kVoltaGPURatioThreshold = 0.5; constexpr float kConv2DGPUFP16Threshold = 0.5; +struct MutableNodeViewFormatter { + void operator()(std::string* out, utils::MutableNodeView* node_view) const { + absl::StrAppend(out, node_view->node()->name()); + } +}; + inline std::pair GetNumGPUs(const Cluster& cluster) { auto devices = cluster.GetDevices(); int num_gpus = 0; @@ -267,12 +273,17 @@ Status EraseCancellableNodesAroundPad(TransposeContext* context) { utils::MutableGraphView* graph_view = context->graph_view.get(); utils::Mutation* mutation = graph_view->GetMutationBuilder(); + absl::flat_hash_set cancelled_transposes; + const int num_nodes = graph_view->NumNodes(); for (int i = 0; i < num_nodes; ++i) { // Transpose node after Pad. auto* transpose_after = graph_view->GetNode(i); if (!IsTranspose(*transpose_after->node())) continue; + // This transpose was already cancelled in previous loop iteration. + if (cancelled_transposes.contains(transpose_after)) continue; + // Pad node. const auto& transpose_after_fanin = transpose_after->GetRegularFanin(0); auto* pad = transpose_after_fanin.node_view(); @@ -306,10 +317,34 @@ Status EraseCancellableNodesAroundPad(TransposeContext* context) { &permute_t)) continue; - VLOG(0) << "Cancel transpose node pair around pad node:" + // Pad output might be used multiple times by different Transpose nodes. If + // they all have identical permutation, we can cancel all of them. + std::vector pad_fanout_transposes; + pad_fanout_transposes.emplace_back(transpose_after); + + bool pad_has_unsupported_fanout = false; + for (auto& fanout : pad->GetRegularFanout(0)) { + auto* extra_transpose = fanout.node_view(); + if (extra_transpose == transpose_after) continue; + + // Check that fanout is a Transpose identical to the transpose_after. + Tensor extra_permute_t; + if (!GetValueAttrFromConstInputNode(*extra_transpose, IsTranspose, 1, + &extra_permute_t) || + extra_permute_t.tensor_data() != permute_t.tensor_data()) { + pad_has_unsupported_fanout = true; + break; + } + + pad_fanout_transposes.emplace_back(extra_transpose); + } + if (pad_has_unsupported_fanout) continue; + + VLOG(0) << "Cancel Transpose nodes around Pad:" << " transpose_before=" << transpose_before->node()->name() - << " pad=" << pad->node()->name() - << " transpose_after=" << transpose_after->node()->name(); + << " pad=" << pad->node()->name() << " transpose_after=" + << absl::StrJoin(pad_fanout_transposes, ",", + MutableNodeViewFormatter()); // Permute paddings in place according to permutation in second transpose. auto permutation_s = absl::Span(permute_t.flat().data(), @@ -325,14 +360,16 @@ Status EraseCancellableNodesAroundPad(TransposeContext* context) { // Transform Transpose nodes into Identity nodes. 
const auto transpose_to_identity = - [&mutation](utils::MutableNodeView* transpose) -> void { + [&cancelled_transposes, + &mutation](utils::MutableNodeView* transpose) -> void { mutation->UpdateNodeOp(transpose, "Identity"); mutation->RemoveNodeAttr(transpose, "Tperm"); mutation->RemoveRegularFanin(transpose, 1); + cancelled_transposes.insert(transpose); }; transpose_to_identity(transpose_before); - transpose_to_identity(transpose_after); + absl::c_for_each(pad_fanout_transposes, transpose_to_identity); } return mutation->Apply(); diff --git a/tensorflow/core/grappler/optimizers/generic_layout_optimizer_test.cc b/tensorflow/core/grappler/optimizers/generic_layout_optimizer_test.cc index 3a6316eef25..fd5ae22eac8 100644 --- a/tensorflow/core/grappler/optimizers/generic_layout_optimizer_test.cc +++ b/tensorflow/core/grappler/optimizers/generic_layout_optimizer_test.cc @@ -552,6 +552,8 @@ TEST_F(GenericLayoutOptimizerTest, CancelTransposeAroundPad) { {{"T", DT_FLOAT}, {"Tpaddings", DT_INT32}}), NDef("transpose_1", "Transpose", {"pad", "perm_nchw_to_nhwc"}, {{"T", DT_FLOAT}, {"Tperm", DT_INT32}}), + NDef("transpose_2", "Transpose", {"pad", "perm_nchw_to_nhwc"}, + {{"T", DT_FLOAT}, {"Tperm", DT_INT32}}), }); GraphDef output; @@ -575,17 +577,21 @@ TEST_F(GenericLayoutOptimizerTest, CancelTransposeAroundPad) { NDef("pad", "Pad", {"transpose_0", "paddings"}, {{"T", DT_FLOAT}, {"Tpaddings", DT_INT32}}), NDef("transpose_1", "Identity", {"pad"}, {{"T", DT_FLOAT}}), + NDef("transpose_2", "Identity", {"pad"}, {{"T", DT_FLOAT}}), }); CompareGraphs(expected, output); Tensor x = GenerateRandomTensor({2, 6, 6, 8}); - item.fetch = {"transpose_1"}; + item.fetch = {"transpose_1", "transpose_2"}; item.feed.emplace_back("x", x); auto tensors_expected = EvaluateFetchNodes(item); GrapplerItem optimized = item.WithGraph(std::move(output)); auto tensors = EvaluateFetchNodes(optimized); + ASSERT_EQ(tensors.size(), 2); + ASSERT_EQ(tensors_expected.size(), 2); test::ExpectTensorEqual(tensors_expected[0], tensors[0]); + test::ExpectTensorEqual(tensors_expected[1], tensors[1]); } // TODO(yanzha): Add more complex Graph for test. From b8111870ca1e47ccb9e9493b470c0852c6eee250 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 24 Jul 2019 16:09:52 -0700 Subject: [PATCH 0520/3053] Automated rollback of commit 0947898a14b96ce8e13d3c581ffb0d5af9608083 PiperOrigin-RevId: 259837649 --- .../data/experimental/ops/interleave_ops.py | 75 ++---------------- .../python/data/experimental/ops/readers.py | 33 +++----- tensorflow/python/data/ops/readers.py | 79 ++++++++++++++++--- 3 files changed, 81 insertions(+), 106 deletions(-) diff --git a/tensorflow/python/data/experimental/ops/interleave_ops.py b/tensorflow/python/data/experimental/ops/interleave_ops.py index 9abf8fb8cb5..9c9645c4947 100644 --- a/tensorflow/python/data/experimental/ops/interleave_ops.py +++ b/tensorflow/python/data/experimental/ops/interleave_ops.py @@ -20,84 +20,20 @@ from __future__ import print_function from tensorflow.python.compat import compat from tensorflow.python.data.experimental.ops import random_ops from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.data.util import convert +from tensorflow.python.data.ops import readers from tensorflow.python.data.util import nest from tensorflow.python.data.util import structure from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_spec from tensorflow.python.ops import array_ops -from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops +from tensorflow.python.ops import gen_experimental_dataset_ops from tensorflow.python.ops import gen_stateless_random_ops from tensorflow.python.ops import math_ops from tensorflow.python.util import deprecation from tensorflow.python.util.tf_export import tf_export -class _ParallelInterleaveDataset(dataset_ops.UnaryDataset): - """A `Dataset` that maps a function over its input and flattens the result.""" - - def __init__(self, input_dataset, map_func, cycle_length, block_length, - sloppy, buffer_output_elements, prefetch_input_elements): - """See `tf.data.experimental.parallel_interleave()` for details.""" - self._input_dataset = input_dataset - self._map_func = dataset_ops.StructuredFunctionWrapper( - map_func, self._transformation_name(), dataset=input_dataset) - if not isinstance(self._map_func.output_structure, dataset_ops.DatasetSpec): - raise TypeError("`map_func` must return a `Dataset` object.") - self._element_spec = self._map_func.output_structure._element_spec # pylint: disable=protected-access - self._cycle_length = ops.convert_to_tensor( - cycle_length, dtype=dtypes.int64, name="cycle_length") - self._block_length = ops.convert_to_tensor( - block_length, dtype=dtypes.int64, name="block_length") - self._sloppy = ops.convert_to_tensor( - sloppy, dtype=dtypes.bool, name="sloppy") - self._buffer_output_elements = convert.optional_param_to_tensor( - "buffer_output_elements", - buffer_output_elements, - argument_default=2 * block_length) - self._prefetch_input_elements = convert.optional_param_to_tensor( - "prefetch_input_elements", - prefetch_input_elements, - argument_default=2 * cycle_length) - # pylint: disable=protected-access - if compat.forward_compatible(2019, 8, 3): - variant_tensor = ged_ops.parallel_interleave_dataset( - self._input_dataset._variant_tensor, - self._map_func.function.captured_inputs, - self._cycle_length, - self._block_length, - self._sloppy, - self._buffer_output_elements, - self._prefetch_input_elements, - f=self._map_func.function, - **self._flat_structure) - else: - variant_tensor = ged_ops.experimental_parallel_interleave_dataset( - self._input_dataset._variant_tensor, - 
self._map_func.function.captured_inputs, - self._cycle_length, - self._block_length, - self._sloppy, - self._buffer_output_elements, - self._prefetch_input_elements, - f=self._map_func.function, - **self._flat_structure) - # pylint: enable=protected-access - super(_ParallelInterleaveDataset, self).__init__(input_dataset, - variant_tensor) - - def _functions(self): - return [self._map_func] - - @property - def element_spec(self): - return self._element_spec - - def _transformation_name(self): - return "tf.data.experimental.parallel_interleave()" - - @deprecation.deprecated( None, "Use `tf.data.Dataset.interleave(map_func, cycle_length, block_length, " @@ -154,7 +90,7 @@ def parallel_interleave(map_func, `tf.data.Dataset.apply`. """ def _apply_fn(dataset): - return _ParallelInterleaveDataset( + return readers.ParallelInterleaveDataset( dataset, map_func, cycle_length, block_length, sloppy, buffer_output_elements, prefetch_input_elements) @@ -193,13 +129,13 @@ class _DirectedInterleaveDataset(dataset_ops.Dataset): # pylint: disable=protected-access if compat.forward_compatible(2019, 8, 3): return ( - ged_ops.directed_interleave_dataset( + gen_experimental_dataset_ops.directed_interleave_dataset( self._selector_input._variant_tensor, [data_input._variant_tensor for data_input in self._data_inputs], **self._flat_structure)) else: return ( - ged_ops.experimental_directed_interleave_dataset( + gen_experimental_dataset_ops.experimental_directed_interleave_dataset( self._selector_input._variant_tensor, [data_input._variant_tensor for data_input in self._data_inputs], **self._flat_structure)) @@ -358,4 +294,3 @@ choose_from_datasets_v1.__doc__ = choose_from_datasets_v2.__doc__ # these aliases in place. choose_from_datasets = choose_from_datasets_v1 sample_from_datasets = sample_from_datasets_v1 - diff --git a/tensorflow/python/data/experimental/ops/readers.py b/tensorflow/python/data/experimental/ops/readers.py index 91ebb5245a9..cf8b8c7a13e 100644 --- a/tensorflow/python/data/experimental/ops/readers.py +++ b/tensorflow/python/data/experimental/ops/readers.py @@ -26,6 +26,7 @@ import numpy as np from tensorflow.python.compat import compat from tensorflow.python.data.experimental.ops import batching from tensorflow.python.data.experimental.ops import error_ops +from tensorflow.python.data.experimental.ops import interleave_ops from tensorflow.python.data.experimental.ops import parsing_ops from tensorflow.python.data.experimental.ops import shuffle_ops from tensorflow.python.data.ops import dataset_ops @@ -493,18 +494,9 @@ def make_csv_dataset_v2( return features # Read files sequentially (if num_parallel_reads=1) or in parallel - cycle_length = num_parallel_reads - if num_parallel_reads == dataset_ops.AUTOTUNE: - cycle_length = core_readers.DEFAULT_CYCLE_LENGTH - dataset = dataset.interleave( - filename_to_dataset, - cycle_length, - num_parallel_calls=num_parallel_reads) - - if sloppy: - options = dataset_ops.Options() - options.experimental_deterministic = False - dataset = dataset.with_options(options) + dataset = dataset.apply( + interleave_ops.parallel_interleave( + filename_to_dataset, cycle_length=num_parallel_reads, sloppy=sloppy)) dataset = _maybe_shuffle_and_repeat( dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed) @@ -846,18 +838,11 @@ def make_batched_features_dataset_v2(file_pattern, reader_args = [] # Read files sequentially (if reader_num_threads=1) or in parallel - cycle_length = reader_num_threads - if reader_num_threads == dataset_ops.AUTOTUNE: - cycle_length = 
core_readers.DEFAULT_CYCLE_LENGTH - dataset = dataset.interleave( - lambda filename: reader(filename, *reader_args), - cycle_length, - num_parallel_calls=reader_num_threads) - - if sloppy_ordering: - options = dataset_ops.Options() - options.experimental_deterministic = False - dataset = dataset.with_options(options) + dataset = dataset.apply( + interleave_ops.parallel_interleave( + lambda filename: reader(filename, *reader_args), + cycle_length=reader_num_threads, + sloppy=sloppy_ordering)) # Extract values if the `Example` tensors are stored as key-value tuples. if dataset_ops.get_legacy_output_types(dataset) == ( diff --git a/tensorflow/python/data/ops/readers.py b/tensorflow/python/data/ops/readers.py index dab33fe2a18..a82f1810e58 100644 --- a/tensorflow/python/data/ops/readers.py +++ b/tensorflow/python/data/ops/readers.py @@ -26,17 +26,13 @@ from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_spec from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_dataset_ops +from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops from tensorflow.python.util.tf_export import tf_export # TODO(b/64974358): Increase default buffer size to 256 MB. _DEFAULT_READER_BUFFER_SIZE_BYTES = 256 * 1024 # 256 KB -# If the user requests the degree of interleave parallelism to be autotuned, -# cycle length controls the maximum level of parallelism. We set it to a small -# constant as a tradeoff between effective parallelism and memory and CPU usage. -DEFAULT_CYCLE_LENGTH = 10 - def _create_or_validate_filenames_dataset(filenames): """Creates (or validates) a dataset of filenames. @@ -84,13 +80,10 @@ def _create_dataset_reader(dataset_creator, filenames, num_parallel_reads=None): if num_parallel_reads is None: return filenames.flat_map(read_one_file) else: - cycle_length = num_parallel_reads - if num_parallel_reads == dataset_ops.AUTOTUNE: - cycle_length = DEFAULT_CYCLE_LENGTH - return filenames.interleave( - read_one_file, - cycle_length, - num_parallel_calls=num_parallel_reads) + return ParallelInterleaveDataset( + filenames, read_one_file, cycle_length=num_parallel_reads, + block_length=1, sloppy=False, buffer_output_elements=None, + prefetch_input_elements=None) class _TextLineDataset(dataset_ops.DatasetSource): @@ -220,6 +213,68 @@ class _TFRecordDataset(dataset_ops.DatasetSource): return tensor_spec.TensorSpec([], dtypes.string) +class ParallelInterleaveDataset(dataset_ops.UnaryDataset): + """A `Dataset` that maps a function over its input and flattens the result.""" + + def __init__(self, input_dataset, map_func, cycle_length, block_length, + sloppy, buffer_output_elements, prefetch_input_elements): + """See `tf.data.experimental.parallel_interleave()` for details.""" + self._input_dataset = input_dataset + self._map_func = dataset_ops.StructuredFunctionWrapper( + map_func, self._transformation_name(), dataset=input_dataset) + if not isinstance(self._map_func.output_structure, dataset_ops.DatasetSpec): + raise TypeError("`map_func` must return a `Dataset` object.") + self._element_spec = self._map_func.output_structure._element_spec # pylint: disable=protected-access + self._cycle_length = ops.convert_to_tensor( + cycle_length, dtype=dtypes.int64, name="cycle_length") + self._block_length = ops.convert_to_tensor( + block_length, dtype=dtypes.int64, name="block_length") + self._sloppy = ops.convert_to_tensor( + sloppy, dtype=dtypes.bool, name="sloppy") + self._buffer_output_elements = 
convert.optional_param_to_tensor( + "buffer_output_elements", + buffer_output_elements, + argument_default=2 * block_length) + self._prefetch_input_elements = convert.optional_param_to_tensor( + "prefetch_input_elements", + prefetch_input_elements, + argument_default=2 * cycle_length) + if compat.forward_compatible(2019, 8, 3): + variant_tensor = ged_ops.parallel_interleave_dataset( + self._input_dataset._variant_tensor, # pylint: disable=protected-access + self._map_func.function.captured_inputs, + self._cycle_length, + self._block_length, + self._sloppy, + self._buffer_output_elements, + self._prefetch_input_elements, + f=self._map_func.function, + **self._flat_structure) + else: + variant_tensor = ged_ops.experimental_parallel_interleave_dataset( + self._input_dataset._variant_tensor, # pylint: disable=protected-access + self._map_func.function.captured_inputs, + self._cycle_length, + self._block_length, + self._sloppy, + self._buffer_output_elements, + self._prefetch_input_elements, + f=self._map_func.function, + **self._flat_structure) + super(ParallelInterleaveDataset, self).__init__(input_dataset, + variant_tensor) + + def _functions(self): + return [self._map_func] + + @property + def element_spec(self): + return self._element_spec + + def _transformation_name(self): + return "tf.data.experimental.parallel_interleave()" + + @tf_export("data.TFRecordDataset", v1=[]) class TFRecordDatasetV2(dataset_ops.DatasetV2): """A `Dataset` comprising records from one or more TFRecord files.""" From 10d28e7c4251b379c86a3f263e4b849da0f8cc3d Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Wed, 24 Jul 2019 16:25:32 -0700 Subject: [PATCH 0521/3053] Internal change PiperOrigin-RevId: 259840566 --- tensorflow/lite/build_def.bzl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/lite/build_def.bzl b/tensorflow/lite/build_def.bzl index cb98f69ec47..202c3057877 100644 --- a/tensorflow/lite/build_def.bzl +++ b/tensorflow/lite/build_def.bzl @@ -110,6 +110,7 @@ def tflite_jni_binary( linkstatic = 1, testonly = 0, deps = [], + tags = [], srcs = []): """Builds a jni binary for TFLite.""" linkopts = linkopts + select({ @@ -130,6 +131,7 @@ def tflite_jni_binary( linkstatic = linkstatic, deps = deps + [linkscript, exported_symbols], srcs = srcs, + tags = tags, linkopts = linkopts, testonly = testonly, ) From 15867c9e2e44ac0de8ba1640f20005ca076e9a6a Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 24 Jul 2019 16:27:53 -0700 Subject: [PATCH 0522/3053] Updates the Apple and Swift Bazel rules versions. PiperOrigin-RevId: 259840994 --- WORKSPACE | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/WORKSPACE b/WORKSPACE index d5bd495ec4d..d2c65bc1b1d 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -49,9 +49,14 @@ remote_config_workspace() # Apple and Swift rules. 
http_archive( name = "build_bazel_rules_apple", - sha256 = "23792cd999f97fc97284d1c44cb1324bfdd0bc54aa68ad513fa3705aca3b1f9e", - urls = ["https://github.com/bazelbuild/rules_apple/releases/download/0.15.0/rules_apple.0.15.0.tar.gz"], + sha256 = "6efdde60c91724a2be7f89b0c0a64f01138a45e63ba5add2dca2645d981d23a1", + urls = ["https://github.com/bazelbuild/rules_apple/releases/download/0.17.2/rules_apple.0.17.2.tar.gz"], ) # https://github.com/bazelbuild/rules_apple/releases +http_archive( + name = "build_bazel_rules_swift", + sha256 = "96a86afcbdab215f8363e65a10cf023b752e90b23abf02272c4fc668fcb70311", + urls = ["https://github.com/bazelbuild/rules_swift/releases/download/0.11.1/rules_swift.0.11.1.tar.gz"], +) # https://github.com/bazelbuild/rules_swift/releases http_archive( name = "build_bazel_apple_support", sha256 = "7356dbd44dea71570a929d1d4731e870622151a5f27164d966dda97305f33471", @@ -62,11 +67,6 @@ http_archive( sha256 = "2ef429f5d7ce7111263289644d233707dba35e39696377ebab8b0bc701f7818e", urls = ["https://github.com/bazelbuild/bazel-skylib/releases/download/0.8.0/bazel-skylib.0.8.0.tar.gz"], ) # https://github.com/bazelbuild/bazel-skylib/releases -http_archive( - name = "build_bazel_rules_swift", - sha256 = "9efe9699e9765e6b4a5e063e4a08f6b163cccaf0443f775d935baf5c3cd6ed0e", - urls = ["https://github.com/bazelbuild/rules_swift/releases/download/0.9.0/rules_swift.0.9.0.tar.gz"], -) # https://github.com/bazelbuild/rules_swift/releases http_archive( name = "com_github_apple_swift_swift_protobuf", type = "zip", From a9ee94137705eed8cbbdde1b9d7c38e6f992f433 Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Wed, 24 Jul 2019 16:35:28 -0700 Subject: [PATCH 0523/3053] Update the error messages for illegal use of `Tensor` as `bool` or `iterator` in graph mode and illegal Tensor captures, to be more in line with common practices in TF 2.0. Illegal bool/iter is typically raised when: * using Tensors in control flow like `if` or `while` or `for` statements and for some reason AutoGraph did not pick them up Illegal tensor capture is typically raised when: * python collections are used inside TensorFlow control flow * hidden side effects hide the modification of a value from AutoGraph * Tensor values are stores in global Python variables * functions closed over local Tensor variables The error messages will be inaccurate when users forcefully attempting a bool casting inside `@tf.function`: @tf.function def f(): bool(tf.constant(True)) But this use case is much less likely than the other ones. Note that there is an older code path which seems to capture control flow V1. 
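As an illustrative sketch (not part of this change itself), the bool case can also
be hit by disabling AutoGraph and branching on a symbolic Tensor:

  @tf.function(autograph=False)
  def f(x):
    if x > 0:  # bool(tf.Tensor) -> raises OperatorNotAllowedInGraphError
      x = x + 1
    return x

  f(tf.constant(1))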
PiperOrigin-RevId: 259842367 --- tensorflow/python/autograph/impl/api.py | 4 +- tensorflow/python/autograph/impl/api_test.py | 3 +- tensorflow/python/client/session.py | 8 ++- tensorflow/python/eager/def_function_test.py | 3 +- tensorflow/python/framework/errors_impl.py | 8 +++ tensorflow/python/framework/func_graph.py | 19 +++--- tensorflow/python/framework/ops.py | 64 +++++++++++++------ tensorflow/python/keras/engine/base_layer.py | 26 ++++---- .../python/kernel_tests/slice_op_test.py | 4 +- 9 files changed, 90 insertions(+), 49 deletions(-) diff --git a/tensorflow/python/autograph/impl/api.py b/tensorflow/python/autograph/impl/api.py index d850937423c..c0364f36d45 100644 --- a/tensorflow/python/autograph/impl/api.py +++ b/tensorflow/python/autograph/impl/api.py @@ -100,7 +100,9 @@ class _ErrorMetadata(errors.ErrorMetadataBase): return t( node_def=self.cause.node_def, op=self.cause.op, message=message) - elif preferred_type in (AutoGraphError, ConversionError, StagingError): + elif preferred_type in (AutoGraphError, ConversionError, StagingError, + errors_impl.InaccessibleTensorError, + errors_impl.OperatorNotAllowedInGraphError): return preferred_type(self.get_message()) exc = super(_ErrorMetadata, self).create_exception(preferred_type) diff --git a/tensorflow/python/autograph/impl/api_test.py b/tensorflow/python/autograph/impl/api_test.py index 43330f707f1..0b8f8162036 100644 --- a/tensorflow/python/autograph/impl/api_test.py +++ b/tensorflow/python/autograph/impl/api_test.py @@ -456,8 +456,7 @@ class ApiTest(test.TestCase): # tc is still a TestClass - constructors are whitelisted. # TODO(b/124016764): Support this use case. # The error below is specific to the `if` statement not being converted. - with self.assertRaisesRegex(TypeError, - 'Using a `tf.Tensor` as a Python `bool`'): + with self.assertRaises(TypeError): tc.test_method() def test_converted_call_mangled_properties(self): diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py index 032781e89fc..2ccb7460027 100644 --- a/tensorflow/python/client/session.py +++ b/tensorflow/python/client/session.py @@ -491,8 +491,12 @@ class _FetchHandler(object): def _assert_fetchable(self, graph, op): if not graph.is_fetchable(op): - raise ValueError('Operation %r has been marked as not fetchable.' % - op.name) + raise errors.InaccessibleTensorError( + 'Operation %r has been marked as not fetchable. Typically this' + ' happens when it is defined in another function or code block.' + ' Use return values,explicit Python locals or TensorFlow collections' + ' to access it.' + % op.name) def fetches(self): """Return the unique names of tensors to fetch. 
diff --git a/tensorflow/python/eager/def_function_test.py b/tensorflow/python/eager/def_function_test.py index 4a7d6fe4e9e..9ab42b63098 100644 --- a/tensorflow/python/eager/def_function_test.py +++ b/tensorflow/python/eager/def_function_test.py @@ -391,7 +391,8 @@ class DefFunctionTest(test.TestCase): outputs.append(inputs[t]) return outputs - with self.assertRaisesRegexp(ValueError, 'inner'): + with self.assertRaisesRegexp(errors.InaccessibleTensorError, + 'defined in another function or code block'): f(array_ops.zeros(shape=(8, 42, 3))) def testRuntimeErrorNotSticky(self): diff --git a/tensorflow/python/framework/errors_impl.py b/tensorflow/python/framework/errors_impl.py index fbdc2aaa0ea..caaeab40254 100644 --- a/tensorflow/python/framework/errors_impl.py +++ b/tensorflow/python/framework/errors_impl.py @@ -46,6 +46,14 @@ def _compact_stack_trace(op): return compact_traces +class InaccessibleTensorError(ValueError): + pass + + +class OperatorNotAllowedInGraphError(TypeError): + pass + + @tf_export("errors.OpError", v1=["errors.OpError", "OpError"]) @deprecation.deprecated_endpoints("OpError") class OpError(Exception): diff --git a/tensorflow/python/framework/func_graph.py b/tensorflow/python/framework/func_graph.py index f747110f318..fc7b8461706 100644 --- a/tensorflow/python/framework/func_graph.py +++ b/tensorflow/python/framework/func_graph.py @@ -29,6 +29,7 @@ from tensorflow.python.eager import tape from tensorflow.python.eager.graph_only_ops import graph_placeholder from tensorflow.python.framework import composite_tensor from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_spec from tensorflow.python.framework.auto_control_deps import AutomaticControlDependencies @@ -546,6 +547,11 @@ class FuncGraph(ops.Graph): Returns: Tensor from this FuncGraph. + + Raises: + InaccessibleTensorError: if any tensors are accessed in a manner that + bypasses the mechanisms required for the data dependencies to be correctly + wired. """ # Note: _forward_func_graph is currently only set when building the gradient # graph graph of a defun call. If the backwards graph tries to capture @@ -578,14 +584,11 @@ class FuncGraph(ops.Graph): inner_graph = tensor.graph while inner_graph is not None and isinstance(inner_graph, FuncGraph): if inner_graph is self: - raise ValueError( - "Trying to capture a tensor from an inner function. This can be " - "caused by accessing a tensor defined inside a loop or " - "conditional body, or a subfunction, from a calling function, " - "without going through the proper return value mechanism. " - "Consider using TensorFlow mechanisms such as TensorArrays " - "to return tensors from inner functions or loop / conditional " - "bodies. Tensor: %s; tensor graph: %s; this graph: %s" + raise errors.InaccessibleTensorError( + "The tensor '%s' cannot be accessed here: it is defined" + " in another function or code block. Use return values," + " explicit Python locals or TensorFlow collections to access" + " it. 
Defined in: %s; accessed from: %s.\n" % (tensor, tensor.graph, self)) inner_graph = inner_graph.outer_graph return self._capture_helper(tensor, name) diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index a20cc832232..61688e5c8bc 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -66,8 +66,13 @@ from tensorflow.python.util import memory from tensorflow.python.util import tf_contextlib from tensorflow.python.util import tf_stack from tensorflow.python.util.deprecation import deprecated_args +from tensorflow.python.util.lazy_loader import LazyLoader from tensorflow.python.util.tf_export import tf_export +ag_ctx = LazyLoader( + "ag_ctx", globals(), + "tensorflow.python.autograph.core.ag_ctx") + # Temporary global switches determining if we should enable the work-in-progress # calls to the C API. These will be removed once all functionality is supported. @@ -500,11 +505,45 @@ class Tensor(_TensorLike): raise ValueError( "Tensor._shape cannot be assigned, use Tensor.set_shape instead.") + def _disallow_when_autograph_disabled(self, task): + raise errors.OperatorNotAllowedInGraphError( + "{} is not allowed: AutoGraph is disabled in this function." + " Try decorating it directly with @tf.function.".format(task)) + + def _disallow_when_autograph_enabled(self, task): + raise errors.OperatorNotAllowedInGraphError( + "{} is not allowed: AutoGraph did not convert this function. Try" + " decorating it directly with @tf.function.".format(task)) + + def _disallow_in_graph_mode(self, task): + raise errors.OperatorNotAllowedInGraphError( + "{} is not allowed in Graph execution. Use Eager execution or decorate" + " this function with @tf.function.".format(task)) + + def _disallow_bool_casting(self): + if ag_ctx.control_status_ctx().status == ag_ctx.Status.DISABLED: + self._disallow_when_autograph_disabled( + "using a `tf.Tensor` as a Python `bool`") + elif ag_ctx.control_status_ctx().status == ag_ctx.Status.ENABLED: + self._disallow_when_autograph_disabled( + "using a `tf.Tensor` as a Python `bool`") + else: + # Default: V1-style Graph execution. + self._disallow_in_graph_mode("using a `tf.Tensor` as a Python `bool`") + + def _disallow_iteration(self): + if ag_ctx.control_status_ctx().status == ag_ctx.Status.DISABLED: + self._disallow_when_autograph_enabled("iterating over `tf.Tensor`") + elif ag_ctx.control_status_ctx().status == ag_ctx.Status.ENABLED: + self._disallow_when_autograph_enabled("iterating over `tf.Tensor`") + else: + # Default: V1-style Graph execution. + self._disallow_in_graph_mode("iterating over `tf.Tensor`") + def __iter__(self): if not context.executing_eagerly(): - raise TypeError( - "Tensor objects are only iterable when eager execution is " - "enabled. To iterate over this tensor use tf.map_fn.") + self._disallow_iteration() + shape = self._shape_tuple() if shape is None: raise TypeError("Cannot iterate over a tensor with unknown shape.") @@ -695,8 +734,8 @@ class Tensor(_TensorLike): """Dummy method to prevent a tensor from being used as a Python `bool`. This overload raises a `TypeError` when the user inadvertently - treats a `Tensor` as a boolean (e.g. in an `if` statement). For - example: + treats a `Tensor` as a boolean (most commonly in an `if` or `while` + statement), in code that was not converted by AutoGraph. For example: ```python if tf.constant(True): # Will raise. @@ -706,17 +745,10 @@ class Tensor(_TensorLike): # ... 
``` - This disallows ambiguities between testing the Python value vs testing the - dynamic condition of the `Tensor`. - Raises: `TypeError`. """ - raise TypeError("Using a `tf.Tensor` as a Python `bool` is not allowed. " - "Use `if t is not None:` instead of `if t:` to test if a " - "tensor is defined, and use TensorFlow ops such as " - "tf.cond to execute subgraphs conditioned on the value of " - "a tensor.") + self._disallow_bool_casting() def __nonzero__(self): """Dummy method to prevent a tensor from being used as a Python `bool`. @@ -726,11 +758,7 @@ class Tensor(_TensorLike): Raises: `TypeError`. """ - raise TypeError("Using a `tf.Tensor` as a Python `bool` is not allowed. " - "Use `if t is not None:` instead of `if t:` to test if a " - "tensor is defined, and use TensorFlow ops such as " - "tf.cond to execute subgraphs conditioned on the value of " - "a tensor.") + self._disallow_bool_casting() def eval(self, feed_dict=None, session=None): """Evaluates this tensor in a `Session`. diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index b193f092374..5fd7f98a776 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -38,6 +38,7 @@ from tensorflow.python.eager import function from tensorflow.python.framework import auto_control_deps from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.framework import func_graph from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_spec @@ -710,16 +711,12 @@ class Layer(module.Module): else: outputs = call_fn(inputs, *args, **kwargs) - except TypeError as e: - exception_str = str(e) - exception_msg = 'Tensor objects are only iterable when eager' - if exception_msg in exception_str: - raise TypeError('You are attempting to use Python control ' - 'flow in a layer that was not declared to be ' - 'dynamic. Pass `dynamic=True` to the class ' - 'constructor.\nEncountered error:\n"""\n' + - exception_str + '\n"""') - raise + except errors.OperatorNotAllowedInGraphError as e: + raise TypeError('You are attempting to use Python control ' + 'flow in a layer that was not declared to be ' + 'dynamic. Pass `dynamic=True` to the class ' + 'constructor.\nEncountered error:\n"""\n' + + str(e) + '\n"""') else: # We will use static shape inference to return symbolic tensors # matching the specifications of the layer outputs. @@ -844,11 +841,10 @@ class Layer(module.Module): if callable(u): try: u = u() - except ValueError as e: - if 'Trying to capture a tensor from an inner function' in str(e): - base_layer_utils.check_graph_consistency( - method='add_update', force_raise=True) - raise + except errors.InaccessibleTensorError: + base_layer_utils.check_graph_consistency( + method='add_update', force_raise=True) + raise # check_graph_consistency may not always raise. base_layer_utils.check_graph_consistency(u, method='add_update') updates.append(u) return updates + self._gather_children_attribute('updates') diff --git a/tensorflow/python/kernel_tests/slice_op_test.py b/tensorflow/python/kernel_tests/slice_op_test.py index 8f7245214a2..258b39b3fb5 100644 --- a/tensorflow/python/kernel_tests/slice_op_test.py +++ b/tensorflow/python/kernel_tests/slice_op_test.py @@ -348,8 +348,8 @@ class SliceTest(test.TestCase): # Tensor from 0 to infinity. This test ensures that this # unintended behavior is prevented. 
c = constant_op.constant(5.0) - with self.assertRaisesWithPredicateMatch( - TypeError, lambda e: "Tensor objects are only iterable" in str(e)): + with self.assertRaisesRegex(errors_impl.OperatorNotAllowedInGraphError, + "iterating over `tf.Tensor`"): for _ in c: pass From eeb01a52d82e9b7fd55aaee4a81f9ba562ff0f4b Mon Sep 17 00:00:00 2001 From: "Xiaoming (Jason) Cui" Date: Wed, 24 Jul 2019 16:44:59 -0700 Subject: [PATCH 0524/3053] [INTEL MKL] Simplified description of the helper function in the test tensorflow/python/debug/cli/analyzer_cli_test.py --- tensorflow/python/debug/cli/analyzer_cli_test.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/tensorflow/python/debug/cli/analyzer_cli_test.py b/tensorflow/python/debug/cli/analyzer_cli_test.py index 982fccfd58c..bf3a6157720 100644 --- a/tensorflow/python/debug/cli/analyzer_cli_test.py +++ b/tensorflow/python/debug/cli/analyzer_cli_test.py @@ -46,17 +46,9 @@ from tensorflow.python.platform import googletest from tensorflow.python.platform import test from tensorflow.python.util import tf_inspect -# There are two types MKL supported operators. One type operators whose kernels -# understand MKL layout in input tensors, # (e.g., MklConv2D, etc.) we -# registered them with 'MklLayoutDependentOp' label. The other operators whose -# kernels don't understand input tensors with MKL layout. # (e.g., MklMatMul, -# MklTranspose), we registered them with 'MklNameChangeOp' label. With those -# operators registered as 'MklNameChangeOp' operators, we go through a name -# change during graph rewrite pass, and we changed the name of operators by -# adding "Mkl" before their original name. In this test, only MatMul is -# affected. We add this function to automatically change the operator's name -# 'MatMul' to 'MklMatMul' when the test is running with MKL enabled TensorFlow, -# so that the test can pass. +# Helper function to accommodate MKL-enabled TensorFlow: +# MatMul op is supported by MKL and its name is prefixed with "_Mkl" during the +# MKL graph rewrite pass. def matmul_op_name(): return "_MklMatMul" if test_util.IsMklEnabled() else "MatMul" From fa5fc003b592e667b3bd106daf493295e1cc559d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 24 Jul 2019 16:36:21 -0700 Subject: [PATCH 0525/3053] Add REDUCE_* support to the NNAPI delegate Also update LSH_PROJECTION to properly support sparse projection on Q+. And add check for quantization parameter for MEAN. PiperOrigin-RevId: 259842530 --- .../lite/delegates/nnapi/nnapi_delegate.cc | 124 +++++++++++++++++- tensorflow/lite/kernels/BUILD | 1 + tensorflow/lite/nnapi/NeuralNetworksTypes.h | 5 + 3 files changed, 126 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc index 4b4737c9084..837ae62f2bd 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc @@ -15,6 +15,7 @@ limitations under the License. 
#include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h" #include +#include #include #include #include @@ -142,7 +143,9 @@ bool NeedInt8Conversion(const TfLiteContext* context, int builtin_code, } case kTfLiteBuiltinL2Normalization: case kTfLiteBuiltinSub: - case kTfLiteBuiltinTanh: { + case kTfLiteBuiltinTanh: + case kTfLiteBuiltinReduceMin: + case kTfLiteBuiltinReduceMax: { return input_type == kTfLiteInt8; } default: @@ -1292,16 +1295,31 @@ class NNAPIDelegateKernel { break; case kTfLiteBuiltinLshProjection: if (version == 1) { - // NNAPI does not support sparse projection correctly (b/111751836). if (reinterpret_cast(node->builtin_data) ->type == kTfLiteLshProjectionSparse) { - return nullptr; + // NNAPI does not support sparse projection correctly pre-Q + // (b/111751836). + if (android_sdk_version < kMinSdkVersionForNNAPI12) { + return nullptr; + } + // NNAPI does not support weights for sparse projects. + if (node->inputs->size != 2) { + return nullptr; + } } return [](const NNAPIOpMappingArgs& mapping_args) -> ANeuralNetworksOperationType { auto builtin = reinterpret_cast( mapping_args.node->builtin_data); - mapping_args.builder->AddScalarInt32Operand(builtin->type); + int type = builtin->type; + // In Android Q+, NNAPI uses 3 to denote kTfLiteLshProjectionSparse. + const int kNNAPILshProjectionSparse = 3; + if (builtin->type == kTfLiteLshProjectionSparse) { + type = kNNAPILshProjectionSparse; + // Add NNAPI null weight operand. + mapping_args.builder->AddVectorFloat32Operand(nullptr, 0); + } + mapping_args.builder->AddScalarInt32Operand(type); return ANEURALNETWORKS_LSH_PROJECTION; }; } @@ -1707,6 +1725,14 @@ class NNAPIDelegateKernel { (android_sdk_version >= kMinSdkVersionForNNAPI12 && context->tensors[node->inputs->data[0]].type == kTfLiteUInt8)) && context->tensors[node->outputs->data[0]].dims->size > 0) { + auto input_param = context->tensors[node->inputs->data[0]].params; + auto output_param = context->tensors[node->outputs->data[0]].params; + // NNAPI requires that the input and output have the same + // quantization parameters. + if (input_param.scale != output_param.scale || + input_param.zero_point != output_param.zero_point) { + return nullptr; + } return [](const NNAPIOpMappingArgs& mapping_args) -> ANeuralNetworksOperationType { auto builtin = reinterpret_cast( @@ -2027,6 +2053,96 @@ class NNAPIDelegateKernel { return BasicMappingFn; } } break; + case kTfLiteBuiltinReduceAny: { + if (version != 1 || android_sdk_version < kMinSdkVersionForNNAPI12) { + return nullptr; + } + // NNAPI does not support generating a scalar as output for REDUCE_ANY. + if (context->tensors[node->outputs->data[0]].dims->size == 0) { + return nullptr; + } + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { + auto builtin = reinterpret_cast( + mapping_args.node->builtin_data); + mapping_args.builder->AddScalarBoolOperand(builtin->keep_dims); + return ANEURALNETWORKS_REDUCE_ANY; + }; + } break; + case kTfLiteBuiltinReduceMin: { + if (version != 1 || android_sdk_version < kMinSdkVersionForNNAPI12) { + return nullptr; + } + // NNAPI does not support generating a scalar as output for REDUCE_MIN. 
+ if (context->tensors[node->outputs->data[0]].dims->size == 0) { + return nullptr; + } + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { + auto builtin = reinterpret_cast( + mapping_args.node->builtin_data); + mapping_args.builder->AddScalarBoolOperand(builtin->keep_dims); + return ANEURALNETWORKS_REDUCE_MIN; + }; + } break; + case kTfLiteBuiltinReduceMax: { + if (version != 1 || android_sdk_version < kMinSdkVersionForNNAPI12) { + return nullptr; + } + // NNAPI does not support generating a scalar as output for REDUCE_MAX. + if (context->tensors[node->outputs->data[0]].dims->size == 0) { + return nullptr; + } + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { + auto builtin = reinterpret_cast( + mapping_args.node->builtin_data); + mapping_args.builder->AddScalarBoolOperand(builtin->keep_dims); + return ANEURALNETWORKS_REDUCE_MAX; + }; + } break; + case kTfLiteBuiltinReduceProd: { + if (version != 1 || android_sdk_version < kMinSdkVersionForNNAPI12) { + return nullptr; + } + // NNAPI only supports floating point REDUCE_PROD. + const auto input_type = context->tensors[node->inputs->data[0]].type; + if (input_type != kTfLiteFloat32) { + return nullptr; + } + // NNAPI does not support generating a scalar as output for REDUCE_PROD. + if (context->tensors[node->outputs->data[0]].dims->size == 0) { + return nullptr; + } + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { + auto builtin = reinterpret_cast( + mapping_args.node->builtin_data); + mapping_args.builder->AddScalarBoolOperand(builtin->keep_dims); + return ANEURALNETWORKS_REDUCE_PROD; + }; + } break; + case kTfLiteBuiltinSum: { + if (version != 1 || android_sdk_version < kMinSdkVersionForNNAPI12) { + return nullptr; + } + // NNAPI only supports floating point REDUCE_SUM. + const auto input_type = context->tensors[node->inputs->data[0]].type; + if (input_type != kTfLiteFloat32) { + return nullptr; + } + // NNAPI does not support generating a scalar as output for REDUCE_SUM. + if (context->tensors[node->outputs->data[0]].dims->size == 0) { + return nullptr; + } + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { + auto builtin = reinterpret_cast( + mapping_args.node->builtin_data); + mapping_args.builder->AddScalarBoolOperand(builtin->keep_dims); + return ANEURALNETWORKS_REDUCE_SUM; + }; + } break; default: // All other operators are not mapped. 
return nullptr; diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD index 2b550c95f08..9afe0c8a4e6 100644 --- a/tensorflow/lite/kernels/BUILD +++ b/tensorflow/lite/kernels/BUILD @@ -979,6 +979,7 @@ cc_test( name = "reduce_test", size = "small", srcs = ["reduce_test.cc"], + tags = ["tflite_nnapi"], deps = [ ":builtin_ops", ":test_main", diff --git a/tensorflow/lite/nnapi/NeuralNetworksTypes.h b/tensorflow/lite/nnapi/NeuralNetworksTypes.h index 6b5d8e241e4..fc8d2486837 100644 --- a/tensorflow/lite/nnapi/NeuralNetworksTypes.h +++ b/tensorflow/lite/nnapi/NeuralNetworksTypes.h @@ -115,6 +115,11 @@ enum { ANEURALNETWORKS_POW = 70, ANEURALNETWORKS_PRELU = 71, ANEURALNETWORKS_QUANTIZE = 72, + ANEURALNETWORKS_REDUCE_ANY = 76, + ANEURALNETWORKS_REDUCE_MAX = 77, + ANEURALNETWORKS_REDUCE_MIN = 78, + ANEURALNETWORKS_REDUCE_PROD = 79, + ANEURALNETWORKS_REDUCE_SUM = 80, ANEURALNETWORKS_RSQRT = 83, ANEURALNETWORKS_SELECT = 84, ANEURALNETWORKS_SIN = 85, From 1f555ad942f916692e6d4c624ce087f50db6a2f7 Mon Sep 17 00:00:00 2001 From: Ashwin Murthy Date: Wed, 24 Jul 2019 16:40:28 -0700 Subject: [PATCH 0526/3053] [TFLite] Add a test for flatbuffer export of unidirectional_sequence_lstm op PiperOrigin-RevId: 259843303 --- .../unidirectional_sequence_lstm.mlir | 284 ++++++++++++++++++ 1 file changed, 284 insertions(+) create mode 100644 tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_lstm.mlir diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_lstm.mlir b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_lstm.mlir new file mode 100644 index 00000000000..6c1532663d5 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/unidirectional_sequence_lstm.mlir @@ -0,0 +1,284 @@ +// RUN: flatbuffer_translate -mlir-to-tflite-flatbuffer %s -o - | flatbuffer_to_string - | FileCheck %s + +func @main(tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>, tensor<4 x f32>) -> tensor<4 x f32> { +// CHECK: { +// CHECK-NEXT: version: 3, +// CHECK-NEXT: operator_codes: [ { +// CHECK-NEXT: builtin_code: UNIDIRECTIONAL_SEQUENCE_LSTM +// CHECK-NEXT: } ], +// CHECK-NEXT: subgraphs: [ { +// CHECK-NEXT: tensors: [ { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 1, +// CHECK-NEXT: name: "tfl.pseudo_input", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 2, +// CHECK-NEXT: name: "tfl.pseudo_input1", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 3, +// CHECK-NEXT: name: "tfl.pseudo_input2", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 4, +// CHECK-NEXT: name: "tfl.pseudo_input3", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 5, +// CHECK-NEXT: name: "tfl.pseudo_input4", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: 
buffer: 6, +// CHECK-NEXT: name: "tfl.pseudo_input5", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 7, +// CHECK-NEXT: name: "tfl.pseudo_input6", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 8, +// CHECK-NEXT: name: "tfl.pseudo_input7", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 9, +// CHECK-NEXT: name: "tfl.pseudo_input8", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 10, +// CHECK-NEXT: name: "tfl.pseudo_input9", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 11, +// CHECK-NEXT: name: "tfl.pseudo_input10", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 12, +// CHECK-NEXT: name: "tfl.pseudo_input11", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 13, +// CHECK-NEXT: name: "tfl.pseudo_input12", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 14, +// CHECK-NEXT: name: "tfl.pseudo_input13", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 15, +// CHECK-NEXT: name: "tfl.pseudo_input14", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 16, +// CHECK-NEXT: name: "tfl.pseudo_input15", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 17, +// CHECK-NEXT: name: "tfl.pseudo_input16", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 18, +// CHECK-NEXT: name: "tfl.pseudo_input17", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 19, +// CHECK-NEXT: name: "tfl.pseudo_input18", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: }, +// CHECK-NEXT: is_variable: true +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 20, +// CHECK-NEXT: name: "tfl.pseudo_input19", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: }, +// CHECK-NEXT: is_variable: true +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 21, +// CHECK-NEXT: name: "tfl.pseudo_input20", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 22, +// CHECK-NEXT: name: "tfl.pseudo_input21", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 23, +// CHECK-NEXT: name: "tfl.pseudo_input22", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 24, +// CHECK-NEXT: name: "tfl.pseudo_input23", +// CHECK-NEXT: 
quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: }, { +// CHECK-NEXT: shape: [ 4 ], +// CHECK-NEXT: buffer: 25, +// CHECK-NEXT: name: "tfl.unidirectional_sequence_lstm", +// CHECK-NEXT: quantization: { +// CHECK-EMPTY: +// CHECK-NEXT: } +// CHECK-NEXT: } ], +// CHECK-NEXT: inputs: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 ], +// CHECK-NEXT: outputs: [ 24 ], +// CHECK-NEXT: operators: [ { +// CHECK-NEXT: inputs: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23 ], +// CHECK-NEXT: outputs: [ 24 ], +// CHECK-NEXT: builtin_options_type: UnidirectionalSequenceLSTMOptions, +// CHECK-NEXT: builtin_options: { +// CHECK-NEXT: time_major: true +// CHECK-NEXT: } +// CHECK-NEXT: } ], +// CHECK-NEXT: name: "main" +// CHECK-NEXT: } ], +// CHECK-NEXT: description: "MLIR Converted.", +// CHECK-NEXT: buffers: [ { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: }, { +// CHECK-EMPTY: +// CHECK-NEXT: } ] +// CHECK-NEXT: } +// CHECK-EMPTY: + +^bb0(%arg0: tensor<4 x f32>, %arg1: tensor<4 x f32>, %arg2: tensor<4 x f32>, %arg3: tensor<4 x f32>, %arg4: tensor<4 x f32>, %arg5: tensor<4 x f32>, %arg6: tensor<4 x f32>, %arg7: tensor<4 x f32>, %arg8: tensor<4 x f32>, %arg9: tensor<4 x f32>, %arg10: tensor<4 x f32>, %arg11: tensor<4 x f32>, %arg12: tensor<4 x f32>, %arg13: tensor<4 x f32>, %arg14: tensor<4 x f32>, %arg15: tensor<4 x f32>, %arg16: tensor<4 x f32>, %arg17: tensor<4 x f32>, %arg18: tensor<4 x f32>, %arg19: tensor<4 x f32>, %arg20: tensor<4 x f32>, %arg21: tensor<4 x f32>, %arg22: tensor<4 x f32>, %arg23: tensor<4 x f32>): + %0 = "tfl.pseudo_input" (%arg0) : (tensor<4 x f32>) -> tensor<4 x f32> + %1 = "tfl.pseudo_input" (%arg1) : (tensor<4 x f32>) -> tensor<4 x f32> + %2 = "tfl.pseudo_input" (%arg2) : (tensor<4 x f32>) -> tensor<4 x f32> + %3 = "tfl.pseudo_input" (%arg3) : (tensor<4 x f32>) -> tensor<4 x f32> + %4 = "tfl.pseudo_input" (%arg4) : (tensor<4 x f32>) -> tensor<4 x f32> + %5 = "tfl.pseudo_input" (%arg5) : (tensor<4 x f32>) -> tensor<4 x f32> + %6 = "tfl.pseudo_input" (%arg6) : (tensor<4 x f32>) -> tensor<4 x f32> + %7 = "tfl.pseudo_input" (%arg7) : (tensor<4 x f32>) -> tensor<4 x f32> + %8 = "tfl.pseudo_input" (%arg8) : (tensor<4 x f32>) -> tensor<4 x f32> + %9 = "tfl.pseudo_input" (%arg9) : (tensor<4 x f32>) -> tensor<4 x f32> + %10 = "tfl.pseudo_input" (%arg10) : (tensor<4 x f32>) -> tensor<4 x f32> + %11 = "tfl.pseudo_input" (%arg11) : (tensor<4 x f32>) -> tensor<4 x f32> + %12 = "tfl.pseudo_input" (%arg12) : (tensor<4 x f32>) -> tensor<4 x f32> + %13 = "tfl.pseudo_input" (%arg13) : (tensor<4 x f32>) -> tensor<4 x 
f32> + %14 = "tfl.pseudo_input" (%arg14) : (tensor<4 x f32>) -> tensor<4 x f32> + %15 = "tfl.pseudo_input" (%arg15) : (tensor<4 x f32>) -> tensor<4 x f32> + %16 = "tfl.pseudo_input" (%arg16) : (tensor<4 x f32>) -> tensor<4 x f32> + %17 = "tfl.pseudo_input" (%arg17) : (tensor<4 x f32>) -> tensor<4 x f32> + %18 = "tfl.pseudo_input" (%arg18) : (tensor<4 x f32>) -> tensor<4 x f32> + %19 = "tfl.pseudo_input" (%arg19) : (tensor<4 x f32>) -> tensor<4 x f32> + %20 = "tfl.pseudo_input" (%arg20) : (tensor<4 x f32>) -> tensor<4 x f32> + %21 = "tfl.pseudo_input" (%arg21) : (tensor<4 x f32>) -> tensor<4 x f32> + %22 = "tfl.pseudo_input" (%arg22) : (tensor<4 x f32>) -> tensor<4 x f32> + %23 = "tfl.pseudo_input" (%arg23) : (tensor<4 x f32>) -> tensor<4 x f32> + %24 = "tfl.unidirectional_sequence_lstm"(%0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, %16, %17, %18, %19, %20, %21, %22, %23) {fused_activation_function = "NONE", time_major = true} : (tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>, tensor<4xf32>) -> tensor<4xf32> + return %24 : tensor<4xf32> +} \ No newline at end of file From 12a9859437cd8db105701e64cfbd60961184bcba Mon Sep 17 00:00:00 2001 From: Pavithra Vijay Date: Wed, 24 Jul 2019 17:13:55 -0700 Subject: [PATCH 0527/3053] Fix autograph comment in base layer. PiperOrigin-RevId: 259849617 --- tensorflow/python/keras/engine/base_layer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index 5fd7f98a776..7444189b212 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -684,8 +684,9 @@ class Layer(module.Module): self._maybe_build(inputs) # Wrapping `call` function in autograph to allow for dynamic control - # dependencies in call. We are limiting this to subclassed layers as - # autograph is strictly needed only for subclassed layers and models. + # flow and control dependencies in call. We are limiting this to + # subclassed layers as autograph is strictly needed only for + # subclassed layers and models. # tf_convert will respect the value of autograph setting in the # enclosing tf.function, if any. if base_layer_utils.is_subclassed(self): From a2adb0f8ce43ec788646b69745387be06ac207ed Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Wed, 24 Jul 2019 17:17:35 -0700 Subject: [PATCH 0528/3053] Add Interpreter.resetVariableTensors() binding to Java API Note that this API is experimental, like the C++ API. 
PiperOrigin-RevId: 259850191 --- .../java/org/tensorflow/lite/Interpreter.java | 12 ++++++++++++ .../lite/NativeInterpreterWrapper.java | 6 ++++++ .../native/nativeinterpreterwrapper_jni.cc | 19 +++++++++++++++++++ .../org/tensorflow/lite/InterpreterTest.java | 17 +++++++++++++++++ 4 files changed, 54 insertions(+) diff --git a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java index 5aef4fb0572..37f8b38012d 100644 --- a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java +++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java @@ -388,6 +388,18 @@ public final class Interpreter implements AutoCloseable { wrapper.modifyGraphWithDelegate(delegate); } + /** + * Advanced: Resets all variable tensors to the default value. + * + *

If a variable tensor doesn't have an associated buffer, it will be reset to zero. + * + *

WARNING: This is an experimental API and subject to change. + */ + public void resetVariableTensors() { + checkNotClosed(); + wrapper.resetVariableTensors(); + } + /** Release resources associated with the {@code Interpreter}. */ @Override public void close() { diff --git a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java index 160d4df2783..abe0ec7af86 100644 --- a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java +++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java @@ -193,6 +193,10 @@ final class NativeInterpreterWrapper implements AutoCloseable { delegates.add(delegate); } + void resetVariableTensors() { + resetVariableTensors(interpreterHandle, errorHandle); + } + /** Gets index of an input given its name. */ int getInputIndex(String name) { if (inputsIndexes == null) { @@ -374,6 +378,8 @@ final class NativeInterpreterWrapper implements AutoCloseable { private static native void applyDelegate( long interpreterHandle, long errorHandle, long delegateHandle); + private static native void resetVariableTensors(long interpreterHandle, long errorHandle); + private static native void delete(long errorHandle, long modelHandle, long interpreterHandle); static { diff --git a/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc b/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc index c2abbab1240..b86509788b0 100644 --- a/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc +++ b/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc @@ -508,6 +508,25 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_applyDelegate( } } +JNIEXPORT void JNICALL +Java_org_tensorflow_lite_NativeInterpreterWrapper_resetVariableTensors( + JNIEnv* env, jclass clazz, jlong interpreter_handle, jlong error_handle) { + tflite::Interpreter* interpreter = + convertLongToInterpreter(env, interpreter_handle); + if (interpreter == nullptr) return; + + BufferErrorReporter* error_reporter = + convertLongToErrorReporter(env, error_handle); + if (error_reporter == nullptr) return; + + TfLiteStatus status = interpreter->ResetVariableTensors(); + if (status != kTfLiteOk) { + ThrowException(env, kIllegalArgumentException, + "Internal error: Failed to reset variable tensors: %s", + error_reporter->CachedErrorMessage()); + } +} + JNIEXPORT void JNICALL Java_org_tensorflow_lite_NativeInterpreterWrapper_delete( JNIEnv* env, jclass clazz, jlong error_handle, jlong model_handle, jlong interpreter_handle) { diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java index d62b1e194a1..6f22764abeb 100644 --- a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java +++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTest.java @@ -479,6 +479,23 @@ public final class InterpreterTest { } } + @Test + public void testResetVariableTensors() throws Exception { + float[][][][] inputs = new float[2][8][8][3]; + float[][][][] parsedOutputs = new float[2][8][8][3]; + + // Smoke test to ensure resetting variables at various times in a simple graph doesn't fail. + // TODO(b/138197256): Test with model that has variables. 
+ try (Interpreter interpreter = new Interpreter(MODEL_BUFFER)) { + interpreter.resetVariableTensors(); + interpreter.run(inputs, parsedOutputs); + + interpreter.resetVariableTensors(); + interpreter.resetVariableTensors(); + interpreter.run(inputs, parsedOutputs); + } + } + private static native long getNativeHandleForDelegate(); private static native long getNativeHandleForInvalidDelegate(); From eb76f680ef61358b814ad47ad4897027387d32c5 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 24 Jul 2019 17:22:42 -0700 Subject: [PATCH 0529/3053] Removes TensorFlowLiteC framework version. Version is specified in the podspec. PiperOrigin-RevId: 259850994 --- tensorflow/lite/experimental/ios/BUILD.apple | 9 +-------- tensorflow/lite/experimental/ios/ios.bzl | 3 --- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/tensorflow/lite/experimental/ios/BUILD.apple b/tensorflow/lite/experimental/ios/BUILD.apple index 2d78b2163d5..24d975cb9a0 100644 --- a/tensorflow/lite/experimental/ios/BUILD.apple +++ b/tensorflow/lite/experimental/ios/BUILD.apple @@ -1,19 +1,13 @@ # TensorFlow Lite for iOS -load("//tensorflow/lite/experimental/ios:ios.bzl", "TFL_IOS_BUILD_VERSION", "TFL_MINIMUM_OS_VERSION") +load("//tensorflow/lite/experimental/ios:ios.bzl", "TFL_MINIMUM_OS_VERSION") load("@build_bazel_rules_apple//apple:ios.bzl", "ios_static_framework") -load("@build_bazel_rules_apple//apple:versioning.bzl", "apple_bundle_version") package( default_visibility = ["//visibility:private"], licenses = ["notice"], # Apache 2.0 ) -apple_bundle_version( - name = "TensorFlowLiteC_version", - build_version = TFL_IOS_BUILD_VERSION, -) - ios_static_framework( name = "TensorFlowLiteC_framework", hdrs = [ @@ -22,6 +16,5 @@ ios_static_framework( ], bundle_name = "TensorFlowLiteC", minimum_os_version = TFL_MINIMUM_OS_VERSION, - version = ":TensorFlowLiteC_version", deps = ["//tensorflow/lite/experimental/c:c_api"], ) diff --git a/tensorflow/lite/experimental/ios/ios.bzl b/tensorflow/lite/experimental/ios/ios.bzl index 1698134fb1d..976c6b09a97 100644 --- a/tensorflow/lite/experimental/ios/ios.bzl +++ b/tensorflow/lite/experimental/ios/ios.bzl @@ -1,8 +1,5 @@ """TensorFlow Lite Build Configurations for iOS""" -# Current version of the TensorFlow Lite iOS libraries. -TFL_IOS_BUILD_VERSION = "0.2.0" - TFL_MINIMUM_OS_VERSION = "9.0" # Default tags for filtering iOS targets. Targets are restricted to Apple platforms. From 5abd7942222cf1d7cfae4319fd0ee56113d3db7c Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Wed, 24 Jul 2019 17:25:29 -0700 Subject: [PATCH 0530/3053] compile with default optimizer and allow compile with multiple optimizers. 
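A minimal sketch of what the new default enables (the layer size and random data below are illustrative only): with this change the optimizer argument to compile() may be omitted and falls back to 'rmsprop'.

    import numpy as np
    import tensorflow as tf

    model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
    # No optimizer passed: compile() now defaults to 'rmsprop'.
    model.compile(loss="mse")
    model.fit(np.random.rand(32, 4), np.random.rand(32, 1), epochs=1, verbose=0)

Note that compile() now also accepts a list or tuple of optimizers, but building the standard train function with such a list still raises a ValueError unless the model provides its own backward pass (the _backwards hook checked in training_eager.py).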
PiperOrigin-RevId: 259851436 --- tensorflow/python/keras/engine/training.py | 10 ++++++++-- tensorflow/python/keras/engine/training_eager.py | 6 ++++++ .../tools/api/golden/v1/tensorflow.keras.-model.pbtxt | 2 +- .../api/golden/v1/tensorflow.keras.-sequential.pbtxt | 2 +- .../api/golden/v1/tensorflow.keras.models.-model.pbtxt | 2 +- .../v1/tensorflow.keras.models.-sequential.pbtxt | 2 +- .../tools/api/golden/v2/tensorflow.keras.-model.pbtxt | 2 +- .../api/golden/v2/tensorflow.keras.-sequential.pbtxt | 2 +- .../api/golden/v2/tensorflow.keras.models.-model.pbtxt | 2 +- .../v2/tensorflow.keras.models.-sequential.pbtxt | 2 +- 10 files changed, 22 insertions(+), 10 deletions(-) diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index ee898f825c9..89e82106d50 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -177,7 +177,7 @@ class Model(network.Network): @trackable.no_automatic_dependency_tracking def compile(self, - optimizer, + optimizer='rmsprop', loss=None, metrics=None, loss_weights=None, @@ -274,7 +274,10 @@ class Model(network.Network): sample_weight_mode, target_tensors, weighted_metrics) - self.optimizer = optimizers.get(optimizer) + if isinstance(optimizer, (list, tuple)): + self.optimizer = [optimizers.get(opt) for opt in optimizer] + else: + self.optimizer = optimizers.get(optimizer) # We've disabled automatic dependency tracking for this method, but do want # to add a checkpoint dependency on the optimizer if it's trackable. if isinstance(self.optimizer, trackable.Trackable): @@ -2023,6 +2026,9 @@ class Model(network.Network): def _make_train_function(self): has_recompiled = self._recompile_weights_loss_and_weighted_metrics() self._check_trainable_weights_consistency() + if isinstance(self.optimizer, list): + raise ValueError('The `optimizer` in `compile` should be a single ' + 'optimizer.') # If we have re-compiled the loss/weighted metric sub-graphs then create # train function even if one exists already. This is because # `_feed_sample_weights` list has been updated on re-copmpile. diff --git a/tensorflow/python/keras/engine/training_eager.py b/tensorflow/python/keras/engine/training_eager.py index 15b5ad3061b..8ca72160618 100644 --- a/tensorflow/python/keras/engine/training_eager.py +++ b/tensorflow/python/keras/engine/training_eager.py @@ -245,6 +245,12 @@ def _process_single_batch(model, if training: trainable_weights = model._unique_trainable_weights if trainable_weights: + # TODO(tanzheny) b/132690565: Provide mechanism for user to override + # model.train_on_batch. 
+ if isinstance(model.optimizer, + list) and not hasattr(model, '_backwards'): + raise ValueError('The `optimizer` in `compile` should be a single ' + 'optimizer.') grads = tape.gradient(scaled_total_loss, trainable_weights) if isinstance(model.optimizer, loss_scale_optimizer.LossScaleOptimizer): grads = model.optimizer.get_unscaled_gradients(grads) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt index a13e20be2dc..c28fd8a0725 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt @@ -169,7 +169,7 @@ tf_class { } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "compute_mask" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt index 9ddbdf2b38c..c6336dfe9fe 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt @@ -174,7 +174,7 @@ tf_class { } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "compute_mask" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt index 3840d3d7750..5b9368db391 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt @@ -169,7 +169,7 @@ tf_class { } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "compute_mask" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt index 
3d9f85c87ce..a08172cbc88 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt @@ -174,7 +174,7 @@ tf_class { } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "compute_mask" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt index a13e20be2dc..c28fd8a0725 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt @@ -169,7 +169,7 @@ tf_class { } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "compute_mask" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt index 9ddbdf2b38c..c6336dfe9fe 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt @@ -174,7 +174,7 @@ tf_class { } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "compute_mask" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt index 3840d3d7750..5b9368db391 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt @@ -169,7 +169,7 @@ tf_class { } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + 
argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "compute_mask" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt index 3d9f85c87ce..a08172cbc88 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt @@ -174,7 +174,7 @@ tf_class { } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "compute_mask" From f085af868c4d55e233c1086ca93693efd0389c87 Mon Sep 17 00:00:00 2001 From: Tong Shen Date: Wed, 24 Jul 2019 17:25:54 -0700 Subject: [PATCH 0531/3053] Fixes for XlaCompiler shape inference. 1. Populate _Arg shapes; 2. Run function inlining, shape inference, and then graph optimization (constant folding) on the graph. This is required for the following case: 1. a Tensor is passed into function A as input; 2. in function A, we get shape of the Tensor, and do some modifications to get another shape (e.g. extract only certain dimensions of the shape); 3. the modified shape is passed into another function B as input; 4. in function B, we use the modified shape as compile-time constant input for ops like Fill. This changes ensures in 2), we know the shape of the tensor, and can constant fold the modified shape. PiperOrigin-RevId: 259851491 --- tensorflow/compiler/tf2xla/BUILD | 1 + tensorflow/compiler/tf2xla/xla_compiler.cc | 55 ++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index 6a28a5acb14..9aea4570cc7 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -207,6 +207,7 @@ cc_library( ":side_effect_util", ":tf2xla_util", "//tensorflow/compiler/jit:flags", + "//tensorflow/compiler/jit:shape_inference", "//tensorflow/compiler/jit:xla_cluster_util", "//tensorflow/compiler/tf2xla:rearrange_function_argument", "//tensorflow/compiler/tf2xla/lib:util", diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index 3959f130c20..fe40e13fb33 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -21,6 +21,7 @@ limitations under the License. 
#include "absl/memory/memory.h" #include "absl/types/variant.h" #include "tensorflow/compiler/jit/flags.h" +#include "tensorflow/compiler/jit/shape_inference.h" #include "tensorflow/compiler/tf2xla/graph_compiler.h" #include "tensorflow/compiler/tf2xla/rearrange_function_argument.h" #include "tensorflow/compiler/tf2xla/shape_util.h" @@ -529,6 +530,11 @@ Status XlaCompiler::FindFunctionBody(const NameAttrList& function, std::unique_ptr XlaCompiler::GetGraph(const FunctionBody* fbody) { std::unique_ptr graph(new Graph(options_.flib_def)); CopyGraph(*fbody->graph, graph.get()); + + // Performs a first function inlining pass before shape inference, since + // otherwise shape inference can't see inside functions and a comprehensive + // shape_map, including function ops, is needed to constant-propagate Shape + // Ops below. auto flags = GetBuildXlaOpsPassFlags(); OptimizerOptions opts; opts.set_opt_level(OptimizerOptions::L0); @@ -567,6 +573,28 @@ std::unique_ptr XlaCompiler::GetGraph(const FunctionBody* fbody) { optimizer.Optimize(flib_runtime_, flib_runtime_->env(), /*device=*/nullptr, &graph, graph_optimizer_options); + // Run shape inference on the graph and optimize the graph again. + GraphShapeInfo shape_info; + InferShapes(graph.get(), /*arg_shapes=*/{}, + flib_runtime_->GetFunctionLibraryDefinition(), &shape_info) + .IgnoreError(); + auto node_name_index = graph->BuildNodeNameIndex(); + std::unordered_map> shape_map; + for (const auto& node_shape_info : shape_info) { + const string& node_name = node_shape_info.first; + const std::vector& output_shapes = node_shape_info.second; + const auto& node_iter = node_name_index.find(node_name); + if (node_iter != node_name_index.end()) { + auto& partial_shapes = shape_map[node_name]; + for (const auto& inferred_shape : output_shapes) { + partial_shapes.push_back(inferred_shape.shape); + } + } + } + graph_optimizer_options.shape_map = &shape_map; + optimizer.Optimize(flib_runtime_, flib_runtime_->env(), + /*device=*/nullptr, &graph, graph_optimizer_options); + return graph; } @@ -593,6 +621,33 @@ Status XlaCompiler::CompileFunction( CheckSignature(fbody->arg_types, args), "Signature check failure while compiling: ", fn_name_attrs.name()); + // Set shapes for _Arg nodes. They are useful for constant folding (e.g. an + // Xla op requires a compile-time constant input, and that input is shape of + // an _Arg node. + for (int i = 0; i < args.size(); i++) { + // Skip resource variables and tensor lists. + DataType dtype; + TF_RETURN_IF_ERROR(GetNodeAttr(fbody->arg_nodes[i]->def(), "T", &dtype)); + if (dtype == DT_RESOURCE || dtype == DT_VARIANT) { + continue; + } + + if (absl::holds_alternative(args[i].shape)) { + xla::Shape xla_shape = absl::get(args[i].shape); + TensorShape tensor_shape; + if (XLAShapeToTensorShape(xla_shape, &tensor_shape).ok()) { + fbody->arg_nodes[i]->ClearAttr("_output_shapes"); + fbody->arg_nodes[i]->AddAttr("_output_shapes", + std::vector{tensor_shape}); + } + } else { + TensorShape tensor_shape = absl::get(args[i].shape); + fbody->arg_nodes[i]->ClearAttr("_output_shapes"); + fbody->arg_nodes[i]->AddAttr("_output_shapes", + std::vector{tensor_shape}); + } + } + std::unique_ptr graph = GetGraph(fbody); // Clear the "_kernel" attribute if it is set to "host". This is used to From 6b337b315e06930e4717a72b6217be790ccaef38 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 24 Jul 2019 17:46:48 -0700 Subject: [PATCH 0532/3053] Cleaning up label_image.py example. 
PiperOrigin-RevId: 259854767 --- .../lite/examples/python/label_image.py | 55 ++++++++++--------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/tensorflow/lite/examples/python/label_image.py b/tensorflow/lite/examples/python/label_image.py index 0bc15d36a8a..e9eaa98fac9 100644 --- a/tensorflow/lite/examples/python/label_image.py +++ b/tensorflow/lite/examples/python/label_image.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""label_image for tflite""" +"""label_image for tflite.""" from __future__ import absolute_import from __future__ import division @@ -23,46 +23,49 @@ import numpy as np from PIL import Image -from tensorflow.lite.python import interpreter as interpreter_wrapper +from tensorflow.lite.python.interpreter import Interpreter + def load_labels(filename): - my_labels = [] - input_file = open(filename, 'r') - for l in input_file: - my_labels.append(l.strip()) - return my_labels + with open(filename, 'r') as f: + return [line.strip() for line in f.readlines()] -if __name__ == "__main__": - floating_model = False +if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument("-i", "--image", default="/tmp/grace_hopper.bmp", \ - help="image to be classified") - parser.add_argument("-m", "--model_file", \ - default="/tmp/mobilenet_v1_1.0_224_quant.tflite", \ - help=".tflite model to be executed") - parser.add_argument("-l", "--label_file", default="/tmp/labels.txt", \ - help="name of file containing labels") - parser.add_argument("--input_mean", default=127.5, help="input_mean") - parser.add_argument("--input_std", default=127.5, \ - help="input standard deviation") + parser.add_argument( + '-i', + '--image', + default='/tmp/grace_hopper.bmp', + help='image to be classified') + parser.add_argument( + '-m', + '--model_file', + default='/tmp/mobilenet_v1_1.0_224_quant.tflite', + help='.tflite model to be executed') + parser.add_argument( + '-l', + '--label_file', + default='/tmp/labels.txt', + help='name of file containing labels') + parser.add_argument('--input_mean', default=127.5, help='input_mean') + parser.add_argument( + '--input_std', default=127.5, help='input standard deviation') args = parser.parse_args() - interpreter = interpreter_wrapper.Interpreter(model_path=args.model_file) + interpreter = Interpreter(model_path=args.model_file) interpreter.allocate_tensors() input_details = interpreter.get_input_details() output_details = interpreter.get_output_details() # check the type of the input tensor - if input_details[0]['dtype'] == np.float32: - floating_model = True + floating_model = input_details[0]['dtype'] == np.float32 # NxHxWxC, H:1, W:2 height = input_details[0]['shape'][1] width = input_details[0]['shape'][2] - img = Image.open(args.image) - img = img.resize((width, height)) + img = Image.open(args.image).resize((width, height)) # add N dim input_data = np.expand_dims(img, axis=0) @@ -81,6 +84,6 @@ if __name__ == "__main__": labels = load_labels(args.label_file) for i in top_k: if floating_model: - print('{0:08.6f}'.format(float(results[i]))+":", labels[i]) + print('{:08.6f}: {}'.format(float(results[i]), labels[i])) else: - print('{0:08.6f}'.format(float(results[i]/255.0))+":", labels[i]) + print('{:08.6f}: {}'.format(float(results[i] / 255.0), labels[i])) From 272d69f23c2636ca45e837bd47c366c223702ac6 Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Wed, 24 Jul 2019 
17:47:14 -0700 Subject: [PATCH 0533/3053] Update training_v2 to count of samples if the total number is known. This will bring back the existing behavior of progress bar and callbacks if they rely on the counting of number of example. Also update the callback test to use v2 optimizer, since v1 will fail with run_distributed = True. PiperOrigin-RevId: 259854861 --- tensorflow/python/keras/callbacks_test.py | 4 +- tensorflow/python/keras/engine/training_v2.py | 72 ++++++++++++------- 2 files changed, 50 insertions(+), 26 deletions(-) diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py index f072384d09f..8aca40f80aa 100644 --- a/tensorflow/python/keras/callbacks_test.py +++ b/tensorflow/python/keras/callbacks_test.py @@ -869,7 +869,7 @@ class KerasCallbacksTest(keras_parameterized.TestCase): num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM) model.compile( loss='categorical_crossentropy', - optimizer=keras.optimizers.SGD(lr=0.1)) + optimizer=gradient_descent.SGD(lr=0.1)) return model # TODO(psv): Make sure the callback works correctly when min_delta is @@ -975,7 +975,7 @@ class KerasCallbacksTest(keras_parameterized.TestCase): num_hidden=NUM_HIDDEN, num_classes=NUM_CLASSES, input_dim=INPUT_DIM) model.compile( loss='categorical_crossentropy', - optimizer=keras.optimizers.SGD(lr=0.1), + optimizer=gradient_descent.SGD(lr=0.1), metrics=['accuracy']) return model diff --git a/tensorflow/python/keras/engine/training_v2.py b/tensorflow/python/keras/engine/training_v2.py index 7e89312d891..5d098476800 100644 --- a/tensorflow/python/keras/engine/training_v2.py +++ b/tensorflow/python/keras/engine/training_v2.py @@ -59,10 +59,10 @@ def run_one_epoch(model, batch_size=None, strategy=None, steps_per_epoch=None, + num_samples=None, mode=ModeKeys.TRAIN, training_context=None, - total_epochs=None, - partical_batch_size=None): + total_epochs=None): """Run the execution function with the data from iterator. Given the dataset iterator and execution function, get the data from iterator @@ -77,21 +77,18 @@ def run_one_epoch(model, batch_size: The size of the current batch. strategy: the distribution strategy instance from the model. steps_per_epoch: the number of steps to run for the epoch. + num_samples: the number of samples for the whole epoch if known. This can be + used to calculate the final partial batch, and scale the loss. mode: the mode for the current epoch. training_context: the context that contains callbacks and progress bar. total_epochs: the total number of epochs that will be run. Used when throw error when the iterator unexpectedly reaches its end. - partical_batch_size: the size of the final batch if it is already known. It - will be used to scale the loss value for the final batch. Returns: The loss and metric value from the model. """ # Only use the sample to count if there is a partial batch at the end. 
- use_steps = not (partical_batch_size and batch_size and steps_per_epoch and - steps_per_epoch == dataset_size) - num_samples = None if use_steps else batch_size * (steps_per_epoch - - 1) + partical_batch_size + use_steps = num_samples is None if mode == ModeKeys.PREDICT: aggregator = training_utils.OutputsAggregator( @@ -112,10 +109,17 @@ def run_one_epoch(model, step = 0 while step < target_steps: + if use_steps: + current_batch_size = 1 + elif step < target_steps - 1: + current_batch_size = batch_size + else: + current_batch_size = num_samples - step * batch_size + # TODO(scottzhu): Maybe update the training context to take into account # whether a batch of training happens. Then it could still use a # context manager - batch_logs = {'batch': step, 'size': 1} + batch_logs = {'batch': step, 'size': current_batch_size} training_context.callbacks._call_batch_hook( mode, 'begin', step, batch_logs) training_context.progbar.on_batch_begin(step, batch_logs) @@ -162,7 +166,7 @@ def run_one_epoch(model, aggregator.aggregate( batch_outs, batch_start=step * batch_size, - batch_end=min((step + 1) * batch_size, num_samples)) + batch_end=step * batch_size + current_batch_size) cbks.make_logs(model, batch_logs, batch_outs, mode) training_context.callbacks._call_batch_hook( @@ -216,6 +220,8 @@ class Loop(training_utils.TrainingLoop): validation_steps=validation_steps, distribution_strategy=strategy) + total_samples = _get_total_number_of_samples(training_data_adapter) + use_sample = total_samples is not None do_validation = (validation_adapter is not None) if not steps_per_epoch: @@ -273,11 +279,13 @@ class Loop(training_utils.TrainingLoop): batch_size=batch_size, epochs=epochs, steps_per_epoch=steps_per_epoch, - samples=None, + samples=total_samples, + count_mode='samples' if use_sample else 'steps', verbose=0, # Handle ProgBarLogger separately in this loop. 
mode=ModeKeys.TRAIN) - with training_context.on_start(model, callbacks, verbose, ModeKeys.TRAIN): + with training_context.on_start( + model, callbacks, use_sample, verbose, ModeKeys.TRAIN): # TODO(scottzhu): Handle TPUStrategy training loop for epoch in range(initial_epoch, epochs): if training_context.callbacks.model.stop_training: @@ -303,10 +311,10 @@ class Loop(training_utils.TrainingLoop): batch_size=training_data_adapter.batch_size(), strategy=strategy, steps_per_epoch=steps_per_epoch, + num_samples=total_samples, mode=ModeKeys.TRAIN, training_context=training_context, - total_epochs=epochs, - partical_batch_size=training_data_adapter.partial_batch_size()) + total_epochs=epochs) cbks.make_logs(model, epoch_logs, training_result, ModeKeys.TRAIN) # Evaluation @@ -321,9 +329,11 @@ class Loop(training_utils.TrainingLoop): else: eval_data_iter = iter(validation_dataset) + val_total_samples = _get_total_number_of_samples( + validation_adapter) eval_context = TrainingContext() with eval_context.on_start( - model, callbacks, verbose=0, mode=ModeKeys.TEST): + model, callbacks, use_sample, verbose=0, mode=ModeKeys.TEST): with eval_context.on_epoch(epoch, ModeKeys.TEST): model.reset_metrics() eval_result = run_one_epoch( @@ -334,11 +344,10 @@ class Loop(training_utils.TrainingLoop): batch_size=validation_adapter.batch_size(), strategy=strategy, steps_per_epoch=validation_steps, + num_samples=val_total_samples, mode=ModeKeys.TEST, training_context=eval_context, - total_epochs=1, - partical_batch_size=validation_adapter.partial_batch_size( - )) + total_epochs=1) cbks.make_logs(model, epoch_logs, eval_result, ModeKeys.TEST, prefix='val_') @@ -365,6 +374,8 @@ class Loop(training_utils.TrainingLoop): sample_weights=sample_weight, steps=steps, distribution_strategy=strategy) + total_samples = _get_total_number_of_samples(adapter) + use_sample = total_samples is not None if not steps: steps = adapter.get_size() @@ -393,11 +404,13 @@ class Loop(training_utils.TrainingLoop): batch_size=batch_size, epochs=1, steps_per_epoch=steps, - samples=None, + samples=use_sample, + count_mode='samples' if use_sample else 'steps', verbose=0, # Handle ProgBarLogger separately in this loop. 
mode=mode) - with training_context.on_start(model, callbacks, verbose, mode): + with training_context.on_start( + model, callbacks, use_sample, verbose, mode): # TODO(scottzhu): Handle TPUStrategy training loop with training_context.on_epoch(0, mode) as epoch_logs: model.reset_metrics() @@ -409,10 +422,10 @@ class Loop(training_utils.TrainingLoop): batch_size=adapter.batch_size(), strategy=strategy, steps_per_epoch=steps, + num_samples=total_samples, mode=mode, training_context=training_context, - total_epochs=1, - partical_batch_size=adapter.partial_batch_size()) + total_epochs=1) cbks.make_logs(model, epoch_logs, result, mode) if len(result) == 1: @@ -571,14 +584,25 @@ def _update_sample_weight_mode(model, mode, dataset): del iterator +def _get_total_number_of_samples(adapter): + if not adapter.get_size() or not adapter.batch_size(): + return None + total_sample = adapter.get_size() * adapter.batch_size() + if adapter.has_partial_batch(): + total_sample -= (adapter.batch_size() - adapter.partial_batch_size()) + return total_sample + + class TrainingContext(object): """Utility object that wrap around callbacks and progress bars.""" @tf_contextlib.contextmanager - def on_start(self, model, callbacks=None, verbose=0, mode=ModeKeys.TRAIN): + def on_start(self, model, callbacks=None, use_samples=False, verbose=0, + mode=ModeKeys.TRAIN): """Provide a scope for the whole training process.""" # TODO(omalleyt): Handle ProgBar as part of Callbacks once hooks are ready. - progbar = training_utils.get_progbar(model, 'steps') + progbar = training_utils.get_progbar( + model, 'samples' if use_samples else 'steps') progbar.params = callbacks.params progbar.params['verbose'] = verbose callbacks.model.stop_training = False From 0fa0d44944abd86578fa076802f5a8a7490d5656 Mon Sep 17 00:00:00 2001 From: Anna R Date: Wed, 24 Jul 2019 18:04:24 -0700 Subject: [PATCH 0534/3053] Handle symlinks in tf_upgrade_v2 script as follows: In place upgrade: - Leave symlinks untouched Upgrade with output directory: - Create a new symlink in output directory if the symlink target is inside input directory. - Copy symlink to output directory if the symlink target is not inside input directory. This should address part of #26902 although the behavior is slightly different. Specifically, I am keeping symlinks untouched if they point to a file in a directory that we are not upgrading (as opposed to changing them to regular files). 
PiperOrigin-RevId: 259857509 --- tensorflow/tools/compatibility/ast_edits.py | 18 ++++ .../tools/compatibility/ast_edits_test.py | 84 +++++++++++++++++++ 2 files changed, 102 insertions(+) diff --git a/tensorflow/tools/compatibility/ast_edits.py b/tensorflow/tools/compatibility/ast_edits.py index e80bdc47b82..70ed82dd009 100644 --- a/tensorflow/tools/compatibility/ast_edits.py +++ b/tensorflow/tools/compatibility/ast_edits.py @@ -1032,10 +1032,25 @@ class ASTCodeUpgrader(object): output_directory = os.path.dirname(output_path) if not os.path.isdir(output_directory): os.makedirs(output_directory) + + if os.path.islink(input_path): + link_target = os.readlink(input_path) + link_target_output = os.path.join( + output_root_directory, os.path.relpath(link_target, root_directory)) + if (link_target, link_target_output) in files_to_process: + # Create a link to the new location of the target file + os.symlink(link_target_output, output_path) + else: + report += "Copying symlink %s without modifying its target %s" % ( + input_path, link_target) + os.symlink(link_target, output_path) + continue + file_count += 1 _, l_report, l_errors = self.process_file(input_path, output_path) tree_errors[input_path] = l_errors report += l_report + for input_path, output_path in files_to_copy: output_directory = os.path.dirname(output_path) if not os.path.isdir(output_directory): @@ -1059,6 +1074,9 @@ class ASTCodeUpgrader(object): report += ("=" * 80) + "\n" for path in files_to_process: + if os.path.islink(path): + report += "Skipping symlink %s.\n" % path + continue file_count += 1 _, l_report, l_errors = self.process_file(path, path) tree_errors[path] = l_errors diff --git a/tensorflow/tools/compatibility/ast_edits_test.py b/tensorflow/tools/compatibility/ast_edits_test.py index 0bc87d17d53..d6a366d7220 100644 --- a/tensorflow/tools/compatibility/ast_edits_test.py +++ b/tensorflow/tools/compatibility/ast_edits_test.py @@ -45,6 +45,7 @@ from __future__ import division from __future__ import print_function import ast +import os import six from tensorflow.python.framework import test_util @@ -605,6 +606,89 @@ def t(): _, new_text = self._upgrade(RenameImports(), text) self.assertEqual(expected_text, new_text) + def testUpgradeInplaceWithSymlink(self): + upgrade_dir = os.path.join(self.get_temp_dir(), "foo") + os.mkdir(upgrade_dir) + file_a = os.path.join(upgrade_dir, "a.py") + file_b = os.path.join(upgrade_dir, "b.py") + + with open(file_a, "a") as f: + f.write("import foo as f") + os.symlink(file_a, file_b) + + upgrader = ast_edits.ASTCodeUpgrader(RenameImports()) + upgrader.process_tree_inplace(upgrade_dir) + + self.assertTrue(os.path.islink(file_b)) + self.assertEqual(file_a, os.readlink(file_b)) + with open(file_a, "r") as f: + self.assertEqual("import bar as f", f.read()) + + def testUpgradeInPlaceWithSymlinkInDifferentDir(self): + upgrade_dir = os.path.join(self.get_temp_dir(), "foo") + other_dir = os.path.join(self.get_temp_dir(), "bar") + os.mkdir(upgrade_dir) + os.mkdir(other_dir) + file_c = os.path.join(other_dir, "c.py") + file_d = os.path.join(upgrade_dir, "d.py") + + with open(file_c, "a") as f: + f.write("import foo as f") + os.symlink(file_c, file_d) + + upgrader = ast_edits.ASTCodeUpgrader(RenameImports()) + upgrader.process_tree_inplace(upgrade_dir) + + self.assertTrue(os.path.islink(file_d)) + self.assertEqual(file_c, os.readlink(file_d)) + # File pointed to by symlink is in a different directory. + # Therefore, it should not be upgraded. 
+ with open(file_c, "r") as f: + self.assertEqual("import foo as f", f.read()) + + def testUpgradeCopyWithSymlink(self): + upgrade_dir = os.path.join(self.get_temp_dir(), "foo") + output_dir = os.path.join(self.get_temp_dir(), "bar") + os.mkdir(upgrade_dir) + file_a = os.path.join(upgrade_dir, "a.py") + file_b = os.path.join(upgrade_dir, "b.py") + + with open(file_a, "a") as f: + f.write("import foo as f") + os.symlink(file_a, file_b) + + upgrader = ast_edits.ASTCodeUpgrader(RenameImports()) + upgrader.process_tree(upgrade_dir, output_dir, copy_other_files=True) + + new_file_a = os.path.join(output_dir, "a.py") + new_file_b = os.path.join(output_dir, "b.py") + self.assertTrue(os.path.islink(new_file_b)) + self.assertEqual(new_file_a, os.readlink(new_file_b)) + with open(new_file_a, "r") as f: + self.assertEqual("import bar as f", f.read()) + + def testUpgradeCopyWithSymlinkInDifferentDir(self): + upgrade_dir = os.path.join(self.get_temp_dir(), "foo") + other_dir = os.path.join(self.get_temp_dir(), "bar") + output_dir = os.path.join(self.get_temp_dir(), "baz") + os.mkdir(upgrade_dir) + os.mkdir(other_dir) + file_a = os.path.join(other_dir, "a.py") + file_b = os.path.join(upgrade_dir, "b.py") + + with open(file_a, "a") as f: + f.write("import foo as f") + os.symlink(file_a, file_b) + + upgrader = ast_edits.ASTCodeUpgrader(RenameImports()) + upgrader.process_tree(upgrade_dir, output_dir, copy_other_files=True) + + new_file_b = os.path.join(output_dir, "b.py") + self.assertTrue(os.path.islink(new_file_b)) + self.assertEqual(file_a, os.readlink(new_file_b)) + with open(file_a, "r") as f: + self.assertEqual("import foo as f", f.read()) + if __name__ == "__main__": test_lib.main() From f6c97840e2e87d02906b7cbbf808febedc50a027 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 24 Jul 2019 18:14:52 -0700 Subject: [PATCH 0535/3053] Add delegate support for BATCH_TO_SPACE_ND PiperOrigin-RevId: 259858930 --- .../lite/delegates/nnapi/nnapi_delegate.cc | 18 ++++++++++++++++++ tensorflow/lite/kernels/BUILD | 1 + 2 files changed, 19 insertions(+) diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc index 837ae62f2bd..87c89dde4fc 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc @@ -141,6 +141,7 @@ bool NeedInt8Conversion(const TfLiteContext* context, int builtin_code, } return false; } + case kTfLiteBuiltinBatchToSpaceNd: case kTfLiteBuiltinL2Normalization: case kTfLiteBuiltinSub: case kTfLiteBuiltinTanh: @@ -1501,6 +1502,18 @@ class NNAPIDelegateKernel { return BasicMappingFn; } break; + case kTfLiteBuiltinBatchToSpaceNd: + if (version == 1 && android_sdk_version >= kMinSdkVersionForNNAPI11) { + auto crops = context->tensors[node->inputs->data[2]]; + auto crops_data = crops.data.i32; + // Check if all crops are 0. + if (!crops_data || crops.bytes != 16 || crops_data[0] != 0 || + crops_data[1] != 0 || crops_data[2] != 0 || crops_data[3] != 0) { + return nullptr; + } + return BasicMappingFn; + } + break; case kTfLiteBuiltinStridedSlice: if (version == 1 && android_sdk_version >= kMinSdkVersionForNNAPI11) { return [](const NNAPIOpMappingArgs& mapping_args) @@ -2636,6 +2649,11 @@ class NNAPIDelegateKernel { input_pos == 1) { // The axis param is added during Map continue; + } else if (reg->builtin_code == kTfLiteBuiltinBatchToSpaceNd && + input_pos == 2) { + // NNAPI does not support crops. + // The Map fucntion will check if all crops are zero. 
+ continue; } else if (reg->builtin_code == kTfLiteBuiltinArgMin || reg->builtin_code == kTfLiteBuiltinArgMax) { // The first input tensor is added as is. The second one, specifying diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD index 9afe0c8a4e6..bca715a8ce5 100644 --- a/tensorflow/lite/kernels/BUILD +++ b/tensorflow/lite/kernels/BUILD @@ -708,6 +708,7 @@ cc_test( name = "batch_to_space_nd_test", size = "small", srcs = ["batch_to_space_nd_test.cc"], + tags = ["tflite_nnapi"], deps = [ ":builtin_ops", ":test_main", From 57e0d1acc21507e347953d0c2deceb99cdbd0b33 Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Wed, 24 Jul 2019 18:43:43 -0700 Subject: [PATCH 0536/3053] Mechanical replacement of download.tensorflow.org with https equivalent. PiperOrigin-RevId: 259862509 --- WORKSPACE | 10 +-- .../mlir/tensorflow/ir/control_flow_ops.h | 12 +-- .../mlir/tensorflow/ir/tf_executor_ops.td | 14 ++-- .../nmt_with_attention.ipynb | 2 +- .../eval/python/classifier_metrics_impl.py | 4 +- .../contrib/makefile/download_dependencies.sh | 2 +- tensorflow/examples/android/README.md | 6 +- .../generate_streaming_test_wav.py | 2 +- tensorflow/examples/speech_commands/train.py | 2 +- .../lite/examples/ios/download_models.sh | 4 +- tensorflow/lite/examples/python/README.md | 2 +- tensorflow/lite/g3doc/guide/hosted_models.md | 80 +++++++++---------- .../lite/g3doc/models/smart_reply/overview.md | 2 +- .../lite/g3doc/performance/benchmarks.md | 8 +- tensorflow/lite/java/demo/app/build.gradle | 4 +- .../lite/models/smartreply/g3doc/README.md | 4 +- tensorflow/lite/tools/benchmark/ios/README.md | 2 +- tensorflow/tools/graph_transforms/README.md | 4 +- tensorflow/workspace.bzl | 8 +- 19 files changed, 87 insertions(+), 85 deletions(-) diff --git a/WORKSPACE b/WORKSPACE index d2c65bc1b1d..86830a09476 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -105,7 +105,7 @@ http_archive( sha256 = "7efe12a8363f09bc24d7b7a450304a15655a57a7751929b2c1593a71183bb105", urls = [ "http://storage.googleapis.com/download.tensorflow.org/models/inception_v1.zip", - "http://download.tensorflow.org/models/inception_v1.zip", + "https://storage.googleapis.com/download.tensorflow.org/models/inception_v1.zip", ], ) @@ -115,7 +115,7 @@ http_archive( sha256 = "bddd81ea5c80a97adfac1c9f770e6f55cbafd7cce4d3bbe15fbeb041e6b8f3e8", urls = [ "http://storage.googleapis.com/download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_android_export.zip", - "http://download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_android_export.zip", + "https://storage.googleapis.com/download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_android_export.zip", ], ) @@ -125,7 +125,7 @@ http_archive( sha256 = "859edcddf84dddb974c36c36cfc1f74555148e9c9213dedacf1d6b613ad52b96", urls = [ "http://storage.googleapis.com/download.tensorflow.org/models/mobile_multibox_v1a.zip", - "http://download.tensorflow.org/models/mobile_multibox_v1a.zip", + "https://storage.googleapis.com/download.tensorflow.org/models/mobile_multibox_v1a.zip", ], ) @@ -135,7 +135,7 @@ http_archive( sha256 = "3d374a730aef330424a356a8d4f04d8a54277c425e274ecb7d9c83aa912c6bfa", urls = [ "http://storage.googleapis.com/download.tensorflow.org/models/stylize_v1.zip", - "http://download.tensorflow.org/models/stylize_v1.zip", + "https://storage.googleapis.com/download.tensorflow.org/models/stylize_v1.zip", ], ) @@ -145,6 +145,6 @@ http_archive( sha256 = "c3ec4fea3158eb111f1d932336351edfe8bd515bb6e87aad4f25dbad0a600d0c", urls = [ 
"http://storage.googleapis.com/download.tensorflow.org/models/speech_commands_v0.01.zip", - "http://download.tensorflow.org/models/speech_commands_v0.01.zip", + "https://storage.googleapis.com/download.tensorflow.org/models/speech_commands_v0.01.zip", ], ) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h b/tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h index 2756b4c0885..4bf7029421e 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h @@ -65,7 +65,7 @@ class TFControlType : public Type::TypeBase { // tensor needs its own _tf.Enter to be made available inside the while loop. // // More details can be found in Tensorflow Controlflow white paper: -// http://download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf +// https://storage.googleapis.com/download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf // // This is defined in Tensorflow as: // @@ -100,7 +100,7 @@ class EnterOp // of the operand type along with the index of the first match encountered. // // More details can be found in Tensorflow Controlflow white paper: -// http://download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf +// https://storage.googleapis.com/download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf // // This is defined in TensorFlow as: // @@ -130,7 +130,7 @@ class MergeOp : public Op::Impl, // outside of loop. Each returned tensor needs its own _tf.Exit. // // More details can be found in Tensorflow Controlflow white paper: -// http://download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf +// https://storage.googleapis.com/download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf // // This is defined in Tensorflow as: // diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td index 748416a8142..d8b92468cd0 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_executor_ops.td @@ -221,7 +221,7 @@ def TfExecutor_SwitchOp : TfExecutor_Op<"Switch", let description = [{ More details can be found in Tensorflow Control Flow white paper: - http://download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf + https://storage.googleapis.com/download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf This is defined in TensorFlow as: @@ -302,7 +302,7 @@ def TfExecutor_MergeOp : TfExecutor_Op<"Merge", [NoSideEffect, ControlOperandsAf let description = [{ More details can be found in Tensorflow Control Flow white paper: - http://download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf + https://storage.googleapis.com/download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf This is defined in TensorFlow as: @@ -339,7 +339,7 @@ def TfExecutor_EnterOp : TfExecutor_Op<"Enter", let description = [{ More details can be found in Tensorflow Control Flow white paper: - http://download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf + https://storage.googleapis.com/download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf Each tensor needs its own tf_executor.Enter to be made available inside a while loop. 
@@ -390,7 +390,7 @@ def TfExecutor_NextIterationSourceOp : TfExecutor_Op<"NextIteration.Source", [No of a while loop. Each loop variable needs its own NextIteration op. More details can be found in Tensorflow Control Flow white paper: - http://download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf + https://storage.googleapis.com/download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf In the TF executor dialect, the NextIteration op is broken into tf_executor.NextIteration.sink and tf_executor.NextIteration.source because @@ -447,7 +447,7 @@ def TfExecutor_NextIterationSinkOp : TfExecutor_Op<"NextIteration.Sink"> { of a while loop. Each loop variable needs its own NextIteration op. More details can be found in Tensorflow Control Flow white paper: - http://download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf + https://storage.googleapis.com/download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf In the TF executor dialect, the NextIteration op is broken into tf_executor.NextIteration.sink and tf_executor.NextIteration.source because @@ -507,7 +507,7 @@ def TfExecutor_ExitOp : TfExecutor_Op<"Exit", let description = [{ More details can be found in Tensorflow Control Flow white paper: - http://download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf + https://storage.googleapis.com/download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf This is defined in Tensorflow as: @@ -579,7 +579,7 @@ def TfExecutor_LoopCondOp : TfExecutor_Op<"LoopCond", [NoSideEffect]> { let description = [{ More details can be found in Tensorflow Control Flow white paper: - http://download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf + https://storage.googleapis.com/download.tensorflow.org/paper/white_paper_tf_control_flow_implementation_2017_11_1.pdf This is defined in Tensorflow as: diff --git a/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb index 512605a17eb..cabc71c98e1 100644 --- a/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb +++ b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb @@ -117,7 +117,7 @@ "source": [ "# Download the file\n", "path_to_zip = tf.keras.utils.get_file(\n", - " 'spa-eng.zip', origin='http://download.tensorflow.org/data/spa-eng.zip', \n", + " 'spa-eng.zip', origin='https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip', \n", " extract=True)\n", "\n", "path_to_file = os.path.dirname(path_to_zip)+\"/spa-eng/spa.txt\"" diff --git a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py index 2c301267900..43e1c69bf73 100644 --- a/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py +++ b/tensorflow/contrib/gan/python/eval/python/classifier_metrics_impl.py @@ -74,7 +74,7 @@ __all__ = [ 'INCEPTION_DEFAULT_IMAGE_SIZE', ] -INCEPTION_URL = 'http://download.tensorflow.org/models/frozen_inception_v1_2015_12_05.tar.gz' +INCEPTION_URL = 'https://storage.googleapis.com/download.tensorflow.org/models/frozen_inception_v1_2015_12_05.tar.gz' INCEPTION_FROZEN_GRAPH = 'inceptionv1_for_inception_score.pb' INCEPTION_INPUT = 'Mul:0' INCEPTION_OUTPUT = 'logits:0' @@ -123,7 +123,7 @@ 
def preprocess_image(images, """Prepare a batch of images for evaluation. This is the preprocessing portion of the graph from - http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz. + https://storage.googleapis.com/download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz. Note that it expects Tensors in [0, 255]. This function maps pixel values to [-1, 1] and resizes to match the InceptionV1 network. diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh index efa122b34d8..6cf1145021c 100755 --- a/tensorflow/contrib/makefile/download_dependencies.sh +++ b/tensorflow/contrib/makefile/download_dependencies.sh @@ -140,7 +140,7 @@ replace_by_sed 's#static uint32x2_t p2ui_CONJ_XOR = vld1_u32( conj_XOR_DATA );#s replace_by_sed 's#static uint64x2_t p2ul_CONJ_XOR = vld1q_u64( p2ul_conj_XOR_DATA );#static uint64x2_t p2ul_CONJ_XOR;// = vld1q_u64( p2ul_conj_XOR_DATA ); - Removed by script#' \ "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h" # TODO(satok): Remove this once protobuf/autogen.sh is fixed. -replace_by_sed 's#https://googlemock.googlecode.com/files/gmock-1.7.0.zip#http://download.tensorflow.org/deps/gmock-1.7.0.zip#' \ +replace_by_sed 's#https://googlemock.googlecode.com/files/gmock-1.7.0.zip#https://storage.googleapis.com/download.tensorflow.org/deps/gmock-1.7.0.zip#' \ "${DOWNLOADS_DIR}/protobuf/autogen.sh" cat "third_party/eigen3/gebp_neon.patch" | patch "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h" diff --git a/tensorflow/examples/android/README.md b/tensorflow/examples/android/README.md index 4e4e1685f6d..bb646d2da0e 100644 --- a/tensorflow/examples/android/README.md +++ b/tensorflow/examples/android/README.md @@ -45,7 +45,7 @@ on API >= 14 devices. ## Prebuilt Components: -The fastest path to trying the demo is to download the [prebuilt demo APK](http://download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk). +The fastest path to trying the demo is to download the [prebuilt demo APK](https://storage.googleapis.com/download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk). Also available are precompiled native libraries, and a jcenter package that you may simply drop into your own applications. See @@ -109,7 +109,9 @@ protobuf compilation. NOTE: Bazel does not currently support building for Android on Windows. Full support for gradle/cmake builds is coming soon, but in the meantime we suggest -that Windows users download the [prebuilt demo APK](http://download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk) instead. +that Windows users download the +[prebuilt demo APK](https://storage.googleapis.com/download.tensorflow.org/deps/tflite/TfLiteCameraDemo.apk) +instead. 
##### Install Bazel and Android Prerequisites diff --git a/tensorflow/examples/speech_commands/generate_streaming_test_wav.py b/tensorflow/examples/speech_commands/generate_streaming_test_wav.py index 98589069277..d3df7f4613e 100644 --- a/tensorflow/examples/speech_commands/generate_streaming_test_wav.py +++ b/tensorflow/examples/speech_commands/generate_streaming_test_wav.py @@ -174,7 +174,7 @@ if __name__ == '__main__': '--data_url', type=str, # pylint: disable=line-too-long - default='http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz', + default='https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.01.tar.gz', # pylint: enable=line-too-long help='Location of speech training data') parser.add_argument( diff --git a/tensorflow/examples/speech_commands/train.py b/tensorflow/examples/speech_commands/train.py index 43a399b912e..3686b7dd2b2 100644 --- a/tensorflow/examples/speech_commands/train.py +++ b/tensorflow/examples/speech_commands/train.py @@ -301,7 +301,7 @@ if __name__ == '__main__': '--data_url', type=str, # pylint: disable=line-too-long - default='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz', + default='https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz', # pylint: enable=line-too-long help='Location of speech training data archive on the web.') parser.add_argument( diff --git a/tensorflow/lite/examples/ios/download_models.sh b/tensorflow/lite/examples/ios/download_models.sh index a450aba042e..68a9c96b84e 100755 --- a/tensorflow/lite/examples/ios/download_models.sh +++ b/tensorflow/lite/examples/ios/download_models.sh @@ -17,8 +17,8 @@ set -ex SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -FLOAT_MODEL_URL="http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz" -QUANTIZED_MODEL_URL="http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz" +FLOAT_MODEL_URL="https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz" +QUANTIZED_MODEL_URL="https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz" DOWNLOADS_DIR=$(mktemp -d) cd "$SCRIPT_DIR" diff --git a/tensorflow/lite/examples/python/README.md b/tensorflow/lite/examples/python/README.md index b5ad7d1a412..ddfedb2916c 100644 --- a/tensorflow/lite/examples/python/README.md +++ b/tensorflow/lite/examples/python/README.md @@ -18,7 +18,7 @@ a good demonstration of a model trained to recognize 1,000 different objects. 
# Get photo curl https://raw.githubusercontent.com/tensorflow/tensorflow/master/tensorflow/lite/examples/label_image/testdata/grace_hopper.bmp > /tmp/grace_hopper.bmp # Get model -curl http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz | tar xzv -C /tmp +curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz | tar xzv -C /tmp # Get labels curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_1.0_224_frozen.tgz | tar xzv -C /tmp mobilenet_v1_1.0_224/labels.txt diff --git a/tensorflow/lite/g3doc/guide/hosted_models.md b/tensorflow/lite/g3doc/guide/hosted_models.md index 323d31ba897..ba26ff80065 100644 --- a/tensorflow/lite/g3doc/guide/hosted_models.md +++ b/tensorflow/lite/g3doc/guide/hosted_models.md @@ -21,29 +21,29 @@ For more information about image classification, see classification models offer the smallest model size and fastest performance, at the expense of accuracy. -Model name | Paper and model | Model size | Top-1 accuracy | Top-5 accuracy | TF Lite performance ---------------------------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------: | ---------: | -------------: | -------------: | ------------------: -Mobilenet_V1_0.25_128_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.25_128_quant.tgz) | 0.5 Mb | 39.5% | 64.4% | 3.7 ms -Mobilenet_V1_0.25_160_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.25_160_quant.tgz) | 0.5 Mb | 42.8% | 68.1% | 5.5 ms -Mobilenet_V1_0.25_192_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.25_192_quant.tgz) | 0.5 Mb | 45.7% | 70.8% | 7.9 ms -Mobilenet_V1_0.25_224_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.25_224_quant.tgz) | 0.5 Mb | 48.2% | 72.8% | 10.4 ms -Mobilenet_V1_0.50_128_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.5_128_quant.tgz) | 1.4 Mb | 54.9% | 78.1% | 8.8 ms -Mobilenet_V1_0.50_160_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.5_160_quant.tgz) | 1.4 Mb | 57.2% | 80.5% | 13.0 ms -Mobilenet_V1_0.50_192_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.5_192_quant.tgz) | 1.4 Mb | 59.9% | 82.1% | 18.3 ms -Mobilenet_V1_0.50_224_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.5_224_quant.tgz) | 1.4 Mb | 61.2% | 83.2% | 24.7 ms -Mobilenet_V1_0.75_128_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.75_128_quant.tgz) | 2.6 Mb | 55.9% | 79.1% | 16.2 ms -Mobilenet_V1_0.75_160_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.75_160_quant.tgz) | 2.6 Mb | 62.4% | 83.7% | 24.3 ms 
-Mobilenet_V1_0.75_192_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.75_192_quant.tgz) | 2.6 Mb | 66.1% | 86.2% | 33.8 ms -Mobilenet_V1_0.75_224_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.75_224_quant.tgz) | 2.6 Mb | 66.9% | 86.9% | 45.4 ms -Mobilenet_V1_1.0_128_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_128_quant.tgz) | 4.3 Mb | 63.3% | 84.1% | 24.9 ms -Mobilenet_V1_1.0_160_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_160_quant.tgz) | 4.3 Mb | 66.9% | 86.7% | 37.4 ms -Mobilenet_V1_1.0_192_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_192_quant.tgz) | 4.3 Mb | 69.1% | 88.1% | 51.9 ms -Mobilenet_V1_1.0_224_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz) | 4.3 Mb | 70.0% | 89.0% | 70.2 ms -Mobilenet_V2_1.0_224_quant | [paper](https://arxiv.org/abs/1806.08342), [tflite&pb](http://download.tensorflow.org/models/tflite_11_05_08/mobilenet_v2_1.0_224_quant.tgz) | 3.4 Mb | 70.8% | 89.9% | 53.4 ms -Inception_V1_quant | [paper](https://arxiv.org/abs/1409.4842), [tflite&pb](http://download.tensorflow.org/models/inception_v1_224_quant_20181026.tgz) | 6.4 Mb | 70.1% | 89.8% | 154.5 ms -Inception_V2_quant | [paper](https://arxiv.org/abs/1512.00567), [tflite&pb](http://download.tensorflow.org/models/inception_v2_224_quant_20181026.tgz) | 11 Mb | 73.5% | 91.4% | 235.0 ms -Inception_V3_quant | [paper](https://arxiv.org/abs/1806.08342),[tflite&pb](http://download.tensorflow.org/models/tflite_11_05_08/inception_v3_quant.tgz) | 23 Mb | 77.5% | 93.7% | 637 ms -Inception_V4_quant | [paper](https://arxiv.org/abs/1602.07261), [tflite&pb](http://download.tensorflow.org/models/inception_v4_299_quant_20181026.tgz) | 41 Mb | 79.5% | 93.9% | 1250.8 ms +Model name | Paper and model | Model size | Top-1 accuracy | Top-5 accuracy | TF Lite performance +--------------------------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | ---------: | -------------: | -------------: | ------------------: +Mobilenet_V1_0.25_128_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.25_128_quant.tgz) | 0.5 Mb | 39.5% | 64.4% | 3.7 ms +Mobilenet_V1_0.25_160_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.25_160_quant.tgz) | 0.5 Mb | 42.8% | 68.1% | 5.5 ms +Mobilenet_V1_0.25_192_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.25_192_quant.tgz) | 0.5 Mb | 45.7% | 70.8% | 7.9 ms +Mobilenet_V1_0.25_224_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), 
[tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.25_224_quant.tgz) | 0.5 Mb | 48.2% | 72.8% | 10.4 ms +Mobilenet_V1_0.50_128_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.5_128_quant.tgz) | 1.4 Mb | 54.9% | 78.1% | 8.8 ms +Mobilenet_V1_0.50_160_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.5_160_quant.tgz) | 1.4 Mb | 57.2% | 80.5% | 13.0 ms +Mobilenet_V1_0.50_192_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.5_192_quant.tgz) | 1.4 Mb | 59.9% | 82.1% | 18.3 ms +Mobilenet_V1_0.50_224_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.5_224_quant.tgz) | 1.4 Mb | 61.2% | 83.2% | 24.7 ms +Mobilenet_V1_0.75_128_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.75_128_quant.tgz) | 2.6 Mb | 55.9% | 79.1% | 16.2 ms +Mobilenet_V1_0.75_160_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.75_160_quant.tgz) | 2.6 Mb | 62.4% | 83.7% | 24.3 ms +Mobilenet_V1_0.75_192_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.75_192_quant.tgz) | 2.6 Mb | 66.1% | 86.2% | 33.8 ms +Mobilenet_V1_0.75_224_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_0.75_224_quant.tgz) | 2.6 Mb | 66.9% | 86.9% | 45.4 ms +Mobilenet_V1_1.0_128_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_128_quant.tgz) | 4.3 Mb | 63.3% | 84.1% | 24.9 ms +Mobilenet_V1_1.0_160_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_160_quant.tgz) | 4.3 Mb | 66.9% | 86.7% | 37.4 ms +Mobilenet_V1_1.0_192_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_192_quant.tgz) | 4.3 Mb | 69.1% | 88.1% | 51.9 ms +Mobilenet_V1_1.0_224_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz) | 4.3 Mb | 70.0% | 89.0% | 70.2 ms +Mobilenet_V2_1.0_224_quant | [paper](https://arxiv.org/abs/1806.08342), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/tflite_11_05_08/mobilenet_v2_1.0_224_quant.tgz) | 3.4 Mb | 70.8% | 89.9% | 53.4 ms +Inception_V1_quant | [paper](https://arxiv.org/abs/1409.4842), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/inception_v1_224_quant_20181026.tgz) | 6.4 Mb | 70.1% | 89.8% | 154.5 ms +Inception_V2_quant | 
[paper](https://arxiv.org/abs/1512.00567), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/inception_v2_224_quant_20181026.tgz) | 11 Mb | 73.5% | 91.4% | 235.0 ms +Inception_V3_quant | [paper](https://arxiv.org/abs/1806.08342),[tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/tflite_11_05_08/inception_v3_quant.tgz) | 23 Mb | 77.5% | 93.7% | 637 ms +Inception_V4_quant | [paper](https://arxiv.org/abs/1602.07261), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/inception_v4_299_quant_20181026.tgz) | 41 Mb | 79.5% | 93.9% | 1250.8 ms Note: The model files include both TF Lite FlatBuffer and Tensorflow frozen Graph. @@ -68,23 +68,23 @@ ResNet_V2_101 | [paper](https://arxiv.org/abs/1603.05027), [tflite&pb](h Inception_V3 | [paper](http://arxiv.org/abs/1512.00567), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_v3_2018_04_27.tgz) | 95.3 Mb | 77.9% | 93.8% | 1433 ms | 1522 ms Inception_V4 | [paper](http://arxiv.org/abs/1602.07261), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_v4_2018_04_27.tgz) | 170.7 Mb | 80.1% | 95.1% | 2986 ms | 3139 ms Inception_ResNet_V2 | [paper](https://arxiv.org/abs/1602.07261), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_resnet_v2_2018_04_27.tgz) | 121.0 Mb | 77.5% | 94.0% | 2731 ms | 2926 ms -Mobilenet_V1_0.25_128 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.25_128.tgz) | 1.9 Mb | 41.4% | 66.2% | 6.2 ms | 13.0 ms -Mobilenet_V1_0.25_160 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.25_160.tgz) | 1.9 Mb | 45.4% | 70.2% | 8.6 ms | 19.5 ms -Mobilenet_V1_0.25_192 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.25_192.tgz) | 1.9 Mb | 47.1% | 72.0% | 12.1 ms | 27.8 ms -Mobilenet_V1_0.25_224 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.25_224.tgz) | 1.9 Mb | 49.7% | 74.1% | 16.2 ms | 37.3 ms -Mobilenet_V1_0.50_128 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.5_128.tgz) | 5.3 Mb | 56.2% | 79.3% | 18.1 ms | 29.9 ms -Mobilenet_V1_0.50_160 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.5_160.tgz) | 5.3 Mb | 59.0% | 81.8% | 26.8 ms | 45.9 ms -Mobilenet_V1_0.50_192 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.5_192.tgz) | 5.3 Mb | 61.7% | 83.5% | 35.6 ms | 65.3 ms -Mobilenet_V1_0.50_224 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.5_224.tgz) | 5.3 Mb | 63.2% | 84.9% | 47.6 ms | 164.2 ms -Mobilenet_V1_0.75_128 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.75_128.tgz) | 10.3 Mb | 62.0% | 83.8% | 34.6 ms | 48.7 ms -Mobilenet_V1_0.75_160 | [paper](https://arxiv.org/pdf/1704.04861.pdf), 
[tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.75_160.tgz) | 10.3 Mb | 65.2% | 85.9% | 51.3 ms | 75.2 ms -Mobilenet_V1_0.75_192 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.75_192.tgz) | 10.3 Mb | 67.1% | 87.2% | 71.7 ms | 107.0 ms -Mobilenet_V1_0.75_224 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.75_224.tgz) | 10.3 Mb | 68.3% | 88.1% | 95.7 ms | 143.4 ms -Mobilenet_V1_1.0_128 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_128.tgz) | 16.9 Mb | 65.2% | 85.7% | 57.4 ms | 76.8 ms -Mobilenet_V1_1.0_160 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_160.tgz) | 16.9 Mb | 68.0% | 87.7% | 86.0 ms | 117.7 ms -Mobilenet_V1_1.0_192 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_192.tgz) | 16.9 Mb | 69.9% | 89.1% | 118.6 ms | 167.3 ms -Mobilenet_V1_1.0_224 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz) | 16.9 Mb | 71.0% | 89.9% | 160.1 ms | 224.3 ms -Mobilenet_V2_1.0_224 | [paper](https://arxiv.org/pdf/1801.04381.pdf), [tflite&pb](http://download.tensorflow.org/models/tflite_11_05_08/mobilenet_v2_1.0_224.tgz) | 14.0 Mb | 71.8% | 90.6% | 117 ms | +Mobilenet_V1_0.25_128 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.25_128.tgz) | 1.9 Mb | 41.4% | 66.2% | 6.2 ms | 13.0 ms +Mobilenet_V1_0.25_160 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.25_160.tgz) | 1.9 Mb | 45.4% | 70.2% | 8.6 ms | 19.5 ms +Mobilenet_V1_0.25_192 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.25_192.tgz) | 1.9 Mb | 47.1% | 72.0% | 12.1 ms | 27.8 ms +Mobilenet_V1_0.25_224 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.25_224.tgz) | 1.9 Mb | 49.7% | 74.1% | 16.2 ms | 37.3 ms +Mobilenet_V1_0.50_128 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.5_128.tgz) | 5.3 Mb | 56.2% | 79.3% | 18.1 ms | 29.9 ms +Mobilenet_V1_0.50_160 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.5_160.tgz) | 5.3 Mb | 59.0% | 81.8% | 26.8 ms | 45.9 ms +Mobilenet_V1_0.50_192 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.5_192.tgz) | 5.3 Mb | 61.7% | 83.5% | 35.6 ms | 65.3 ms +Mobilenet_V1_0.50_224 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.5_224.tgz) | 5.3 Mb 
| 63.2% | 84.9% | 47.6 ms | 164.2 ms +Mobilenet_V1_0.75_128 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.75_128.tgz) | 10.3 Mb | 62.0% | 83.8% | 34.6 ms | 48.7 ms +Mobilenet_V1_0.75_160 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.75_160.tgz) | 10.3 Mb | 65.2% | 85.9% | 51.3 ms | 75.2 ms +Mobilenet_V1_0.75_192 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.75_192.tgz) | 10.3 Mb | 67.1% | 87.2% | 71.7 ms | 107.0 ms +Mobilenet_V1_0.75_224 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.75_224.tgz) | 10.3 Mb | 68.3% | 88.1% | 95.7 ms | 143.4 ms +Mobilenet_V1_1.0_128 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_128.tgz) | 16.9 Mb | 65.2% | 85.7% | 57.4 ms | 76.8 ms +Mobilenet_V1_1.0_160 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_160.tgz) | 16.9 Mb | 68.0% | 87.7% | 86.0 ms | 117.7 ms +Mobilenet_V1_1.0_192 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_192.tgz) | 16.9 Mb | 69.9% | 89.1% | 118.6 ms | 167.3 ms +Mobilenet_V1_1.0_224 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz) | 16.9 Mb | 71.0% | 89.9% | 160.1 ms | 224.3 ms +Mobilenet_V2_1.0_224 | [paper](https://arxiv.org/pdf/1801.04381.pdf), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/tflite_11_05_08/mobilenet_v2_1.0_224.tgz) | 14.0 Mb | 71.8% | 90.6% | 117 ms | ### AutoML mobile models diff --git a/tensorflow/lite/g3doc/models/smart_reply/overview.md b/tensorflow/lite/g3doc/models/smart_reply/overview.md index b2363adcf48..abfcc8c2393 100644 --- a/tensorflow/lite/g3doc/models/smart_reply/overview.md +++ b/tensorflow/lite/g3doc/models/smart_reply/overview.md @@ -8,7 +8,7 @@ Our smart reply model generates reply suggestions based on chat messages. The suggestions are intended to be contextually relevant, one-touch responses that help the user to easily reply to an incoming message. -Download +Download starter model and labels ### Sample application diff --git a/tensorflow/lite/g3doc/performance/benchmarks.md b/tensorflow/lite/g3doc/performance/benchmarks.md index a51fdb40807..c7305209f69 100644 --- a/tensorflow/lite/g3doc/performance/benchmarks.md +++ b/tensorflow/lite/g3doc/performance/benchmarks.md @@ -46,7 +46,7 @@ Pixel xl | 0c | - Mobilenet_1.0_224(float) + Mobilenet_1.0_224(float) Pixel 2 123.3 ms @@ -57,7 +57,7 @@ Pixel xl | 0c | - Mobilenet_1.0_224 (quant) + Mobilenet_1.0_224 (quant) Pixel 2 65.4 ms @@ -130,14 +130,14 @@ modified to set `num_threads` to 1. 
- Mobilenet_1.0_224(float) + Mobilenet_1.0_224(float) iPhone 8 32.2 ms - Mobilenet_1.0_224 (quant) + Mobilenet_1.0_224 (quant) iPhone 8 24.4 ms diff --git a/tensorflow/lite/java/demo/app/build.gradle b/tensorflow/lite/java/demo/app/build.gradle index c353b2c25ca..fca18430fa5 100644 --- a/tensorflow/lite/java/demo/app/build.gradle +++ b/tensorflow/lite/java/demo/app/build.gradle @@ -60,8 +60,8 @@ dependencies { } def targetFolder = "src/main/assets" -def modelFloatDownloadUrl = "http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz" -def modelQuantDownloadUrl = "http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz" +def modelFloatDownloadUrl = "https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz" +def modelQuantDownloadUrl = "https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz" def localCacheFloat = "build/intermediates/mobilenet_v1_1.0_224.tgz" def localCacheQuant = "build/intermediates/mmobilenet_v1_1.0_224_quant.tgz" diff --git a/tensorflow/lite/models/smartreply/g3doc/README.md b/tensorflow/lite/models/smartreply/g3doc/README.md index 1b8ff15196c..04439293337 100644 --- a/tensorflow/lite/models/smartreply/g3doc/README.md +++ b/tensorflow/lite/models/smartreply/g3doc/README.md @@ -62,8 +62,8 @@ and [research paper](https://arxiv.org/pdf/1708.00630). ## How to use this Model? We have provided a pre-built demo APK that you can download, install and test on -your phone ([demo APK -here](http://download.tensorflow.org/deps/tflite/SmartReplyDemo.apk)). +your phone +([demo APK here](https://storage.googleapis.com/download.tensorflow.org/deps/tflite/SmartReplyDemo.apk)). The On-Device Smart Reply demo App works in the following way: diff --git a/tensorflow/lite/tools/benchmark/ios/README.md b/tensorflow/lite/tools/benchmark/ios/README.md index 3a9ae27384c..5c772ac3fca 100644 --- a/tensorflow/lite/tools/benchmark/ios/README.md +++ b/tensorflow/lite/tools/benchmark/ios/README.md @@ -13,7 +13,7 @@ parameters like inputs to the model, type of inputs, number of iterations, number of threads. The default values in the JSON file are for the Mobilenet_1.0_224 model ([paper](https://arxiv.org/pdf/1704.04861.pdf), -[tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz)) +[tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz)) ## To build/install/run diff --git a/tensorflow/tools/graph_transforms/README.md b/tensorflow/tools/graph_transforms/README.md index a90916cd1b9..34d6305725f 100644 --- a/tensorflow/tools/graph_transforms/README.md +++ b/tensorflow/tools/graph_transforms/README.md @@ -111,7 +111,7 @@ unsure, the tool can inspect the model and provide guesses about likely input and output nodes, as well as other information that's useful for debugging. Here's an example of how to use it on the [Inception V3 -graph](http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz): +graph](https://storage.googleapis.com/download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz): ```bash bazel build tensorflow/tools/graph_transforms:summarize_graph @@ -124,7 +124,7 @@ This section has small guides for some of the most frequently-used transformation pipelines, aimed at users who want to quickly accomplish one of these tasks. 
A lot of them will use the Inception V3 model for their examples, which can be downloaded from -[http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz](http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz). +[https://storage.googleapis.com/download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz](https://storage.googleapis.com/download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz). ### Optimizing for Deployment diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index f888e2d8b83..a22708b4016 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -787,8 +787,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): build_file = clean_dep("//third_party:tflite_mobilenet_float.BUILD"), sha256 = "2fadeabb9968ec6833bee903900dda6e61b3947200535874ce2fe42a8493abc0", urls = [ - "http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz", - "http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz", + "https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz", + "https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz", ], ) @@ -797,8 +797,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): build_file = clean_dep("//third_party:tflite_mobilenet_quant.BUILD"), sha256 = "d32432d28673a936b2d6281ab0600c71cf7226dfe4cdcef3012555f691744166", urls = [ - "http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz", - "http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz", + "https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz", + "https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz", ], ) From 4ecac5677416f334a6cfe1d5c926477f85082aee Mon Sep 17 00:00:00 2001 From: Phillip Kravtsov Date: Wed, 24 Jul 2019 18:55:53 -0700 Subject: [PATCH 0537/3053] TF-TRT: ySimplified graph conversion functions --- .../tf2tensorrt/convert/convert_graph.cc | 38 +++++++++---------- .../tf2tensorrt/convert/convert_graph.h | 21 ++++++---- .../tf2tensorrt/convert/convert_nodes.h | 4 +- .../tf2tensorrt/convert/convert_nodes_test.cc | 10 +++-- .../tf2tensorrt/kernels/trt_engine_op_test.cc | 7 ++-- 5 files changed, 43 insertions(+), 37 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index 71e754af38f..e581ffdeb65 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -534,14 +534,11 @@ Status CreateTRTNode(const ConversionParams& params, return Status::OK(); } -// Function to construct a funcdef from the segment and add it to the graph. -Status ModifyGraphForFunctionDef(Graph* graph, const GraphDef& segment, - Graph* segment_graph) { +Status ConvertSegmentToGraph(const GraphDef& segment, Graph* segment_graph) { // segment_graph is a graph for the segment, to be modified by this function // graph is the input graph to be optimized by TRT. 
GraphConstructorOptions gcopts; TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(gcopts, segment, segment_graph)); - /* std::map io_nodes; int num_inputs = 0; for (auto n : segment_graph->op_nodes()) { @@ -616,13 +613,13 @@ Status ModifyGraphForFunctionDef(Graph* graph, const GraphDef& segment, } segment_graph->RemoveNode(node); } - */ return Status::OK(); } -Status RegisterModifiedGraphToFunctionLibrary(Graph* segment_graph, Graph* graph, - FunctionDefLibrary fdeflib, - const string& engine_name) { + +Status RegisterGraphToFunctionLibrary(Graph* segment_graph, Graph* graph, + FunctionDefLibrary fdeflib, + const string& engine_name) { auto native_segment = fdeflib.add_function(); TF_RETURN_IF_ERROR(GraphToFunctionDef( *segment_graph, StrCat(engine_name, "_native_segment"), native_segment)); @@ -641,6 +638,16 @@ Status RegisterModifiedGraphToFunctionLibrary(Graph* segment_graph, Graph* graph return Status::OK(); } +Status RegisterSegmentToFunctionLibrary(Graph* graph, const GraphDef& segment, + Graph* segment_graph, + string engine_name) { + GraphConstructorOptions gcopts; + TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(gcopts, segment, segment_graph)); + FunctionDefLibrary fdeflib; + return RegisterGraphToFunctionLibrary(segment_graph, graph, fdeflib, + engine_name); +} + std::pair GetDeviceAndAllocator(const ConversionParams& params, const EngineInfo& engine) { int cuda_device_id = -1; @@ -760,19 +767,10 @@ Status ConvertAfterShapes(const ConversionParams& params) { curr_engine.maximum_cached_engines = params.max_cached_engines; Graph segment_graph(flib); - status = ModifyGraphForFunctionDef(&graph, curr_engine.segment_graph_def, - &segment_graph); + status = RegisterSegmentToFunctionLibrary(&graph, + curr_engine.segment_graph_def, &segment_graph, curr_engine.engine_name); if (!status.ok()) { - LOG(WARNING) << "Failed to modify graph as a function " << t << ": " - << status; - continue; - } - FunctionDefLibrary fdeflib; - status = RegisterModifiedGraphToFunctionLibrary(&segment_graph, &graph, fdeflib, - curr_engine.engine_name); - - if (!status.ok()) { - LOG(WARNING) << "Failed to register segment graphdef as a function " << t + LOG(WARNING) << "Failed to register segment graphdef to the library " << t << ": " << status; continue; } diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h index 476cedaa180..fe56124c31a 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.h @@ -56,16 +56,21 @@ Status ConvertAfterShapes(const ConversionParams& params); std::pair GetDeviceAndAllocator(const ConversionParams& params, const EngineInfo& engine); -// Method to replace Placeholder and identity nodes with Arg and Retval. -// graph is the full graph, while segment_graph is only the segment. -Status ModifyGraphForFunctionDef(Graph* graph, const GraphDef& segment, - Graph* segment_graph); +// Method to register a segment to the function library. The graph +// should contain _Arg/_Retval nodes. +Status RegisterSegmentToFunctionLibrary(Graph* graph, const GraphDef& segment, + Graph* segment_graph, + string engine_name); -// Method that registers the segment graph to a function library. +// Helper method that registers the segment graph to the given function library. // graph is the full graph, while segment_graph is only the segment. 
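+// A typical call sequence (sketch): ConvertAfterShapes() invokes
+// RegisterSegmentToFunctionLibrary(), which converts the segment GraphDef into
+// a Graph and then uses this helper to add it to the library under the name
+// "<engine_name>_native_segment".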
-Status RegisterModifiedGraphToFunctionLibrary(Graph* segment_graph, Graph* graph, - FunctionDefLibrary fdeflib, - const string& engine_name); +Status RegisterGraphToFunctionLibrary(Graph* segment_graph, Graph* graph, + FunctionDefLibrary fdeflib, + const string& engine_name); +// Converts a segment graphdef to a graph, replacing input and output ops to +// Arg and Retval respectively. Used in testing. +Status ConvertSegmentToGraph(const GraphDef& segment, Graph* segment_graph); + } // namespace convert } // namespace tensorrt } // namespace tensorflow diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h index bac845ce2c4..c7331b62a68 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h @@ -118,8 +118,8 @@ struct EngineInfo { bool use_calibration; }; -// Constructs a graphdef from the segment in the given graph. Adds placeholder -// nodes for input edges (InputPH_*) and identity nodes for output edges +// Constructs a graphdef from the segment in the given graph. Adds _Arg +// nodes for input edges (InputPH_*) and _Retval nodes for output edges // (OutputPH_*). This function needs to be called before TensorRT nodes // inserted in order to correctly get sizes from the original graph. // diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index b6a3587005c..effec185dfe 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -1158,7 +1158,7 @@ class ConvertGraphDefToEngineTest : public ::testing::Test { int batch_size = -1; for (const NodeDef& node : gdef.node()) { absl::string_view node_name(node.name()); - if (absl::ConsumePrefix(&node_name, kInputPHName)) { + if (absl::ConsumePrefix(&node_name, IONamePrefixes::kInputPHName)) { int port = -1; EXPECT_TRUE(absl::SimpleAtoi(node_name, &port)) << node.name(); if (input_shapes.size() < port + 1) input_shapes.resize(port + 1); @@ -1188,11 +1188,13 @@ class ConvertGraphDefToEngineTest : public ::testing::Test { TEST_F(ConvertGraphDefToEngineTest, IdentityGraph) { Scope s = Scope::NewRootScope(); - auto input = ops::Placeholder(s.WithOpName(StrCat(kInputPHName, 0)), DT_FLOAT, - ops::Placeholder::Shape({1, 1})); + auto input = ops::Placeholder( + s.WithOpName(StrCat(IONamePrefixes::kInputPHName, 0)), DT_FLOAT, + ops::Placeholder::Shape({1, 1})); auto output = ops::Identity(s.WithOpName("identity1"), input); output = ops::Identity(s.WithOpName("identity2"), output); - output = ops::Identity(s.WithOpName(StrCat(kOutputPHName, 0)), output); + output = ops::Identity( + s.WithOpName(StrCat(IONamePrefixes::kOutputPHName, 0)), output); // If the converter marks the input tensor as output tensor, the conversion // below will fail with: // > TensorRTOutputPH_0 cannot be both input and output diff --git a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc index b5056fa5b91..4d60b24396b 100644 --- a/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc +++ b/tensorflow/compiler/tf2tensorrt/kernels/trt_engine_op_test.cc @@ -24,6 +24,7 @@ limitations under the License. 
#include #include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/compiler/tf2tensorrt/convert/convert_graph.h" +#include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/calibration_resource.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_lru_cache.h" #include "tensorflow/core/framework/fake_input.h" @@ -47,6 +48,7 @@ using ::testing::ElementsAre; class TRTEngineOpTestBase : public OpsTestBase { public: + void AddSimpleTrtOp(DataType dtype, int max_cached_engines_count = 1) { // Create the GPU device. std::unique_ptr device( @@ -65,9 +67,8 @@ class TRTEngineOpTestBase : public OpsTestBase { const string func_name = "myop_native_segment"; Graph* graph = s.graph(); Graph segment_graph(graph->flib_def()); - TF_ASSERT_OK(convert::ModifyGraphForFunctionDef( - graph, graph_def, &segment_graph)); - TF_ASSERT_OK(convert::RegisterModifiedGraphToFunctionLibrary(&segment_graph, graph, + TF_ASSERT_OK(convert::ConvertSegmentToGraph(graph_def, &segment_graph)); + TF_ASSERT_OK(convert::RegisterGraphToFunctionLibrary(&segment_graph, graph, flib_def_->ToProto(), "myop")); PartialTensorShape shape({-1, -1}); From 85ccca4ad1cd710253144180b6570beb2675acb2 Mon Sep 17 00:00:00 2001 From: Nupur Garg Date: Wed, 24 Jul 2019 18:45:06 -0700 Subject: [PATCH 0538/3053] Minor cleanup to simplify convert_to_constants_test. PiperOrigin-RevId: 259862678 --- .../framework/convert_to_constants_test.py | 142 ++++++------------ 1 file changed, 44 insertions(+), 98 deletions(-) diff --git a/tensorflow/python/framework/convert_to_constants_test.py b/tensorflow/python/framework/convert_to_constants_test.py index f962d5ebe47..9f5050d5f62 100644 --- a/tensorflow/python/framework/convert_to_constants_test.py +++ b/tensorflow/python/framework/convert_to_constants_test.py @@ -47,6 +47,24 @@ from tensorflow.python.util import nest class VariablesToConstantsTest(test.TestCase): + def _freezeModel(self, model): + """Freezes the model. + + Args: + model: Function. + + Returns: + root: AutoTrackable object with original ConcreteFunction. + output_func: frozen ConcreteFunction. + """ + root = tracking.AutoTrackable() + root.f = model + input_func = root.f.get_concrete_function() + + output_func = convert_to_constants.convert_variables_to_constants_v2( + input_func, lower_control_flow=False) + return root, output_func + def _hasStatefulPartitionedCallOp(self, graph_def): """Determines if a StatefulPartitionedCall op exists in the graph.""" for node in graph_def.node: @@ -60,6 +78,11 @@ class VariablesToConstantsTest(test.TestCase): def _testConvertedFunction(self, obj, func, converted_concrete_func, input_data): + # Ensure the converted graph has no variables and no function calls. + constant_graph_def = converted_concrete_func.graph.as_graph_def() + self.assertEqual(0, self._getNumVariables(constant_graph_def)) + self.assertFalse(self._hasStatefulPartitionedCallOp(constant_graph_def)) + # Check that the converted ConcreteFunction produces the same result as the # original Function. 
expected_value = nest.flatten(func(**input_data)) @@ -104,10 +127,6 @@ class VariablesToConstantsTest(test.TestCase): output_func = convert_to_constants.convert_variables_to_constants_v2( input_func) - constant_graph_def = output_func.graph.as_graph_def() - self.assertEqual(0, self._getNumVariables(constant_graph_def)) - self.assertFalse(constant_graph_def.library.function) - self._testConvertedFunction(root, root.f, output_func, input_data) @test_util.run_v2_only @@ -125,10 +144,6 @@ class VariablesToConstantsTest(test.TestCase): output_func = convert_to_constants.convert_variables_to_constants_v2( input_func) - constant_graph_def = output_func.graph.as_graph_def() - self.assertEqual(0, self._getNumVariables(constant_graph_def)) - self.assertFalse(self._hasStatefulPartitionedCallOp(constant_graph_def)) - self._testConvertedFunction(root, root.f, output_func, input_data) @test_util.run_v2_only @@ -146,10 +161,6 @@ class VariablesToConstantsTest(test.TestCase): output_func = convert_to_constants.convert_variables_to_constants_v2( input_func) - constant_graph_def = output_func.graph.as_graph_def() - self.assertEqual(0, self._getNumVariables(constant_graph_def)) - self.assertFalse(self._hasStatefulPartitionedCallOp(constant_graph_def)) - self._testConvertedFunction(root, root.f, output_func, input_data) @test_util.run_v2_only @@ -172,10 +183,6 @@ class VariablesToConstantsTest(test.TestCase): output_func = convert_to_constants.convert_variables_to_constants_v2( input_func) - constant_graph_def = output_func.graph.as_graph_def() - self.assertEqual(0, self._getNumVariables(constant_graph_def)) - self.assertFalse(self._hasStatefulPartitionedCallOp(constant_graph_def)) - self._testConvertedFunction(root, root.f, output_func, input_data) @test_util.run_v2_only @@ -209,15 +216,12 @@ class VariablesToConstantsTest(test.TestCase): output_func = convert_to_constants.convert_variables_to_constants_v2( input_func) - constant_graph_def = output_func.graph.as_graph_def() - self.assertEqual(0, self._getNumVariables(constant_graph_def)) - self.assertFalse(self._hasStatefulPartitionedCallOp(constant_graph_def)) - self._testConvertedFunction(root, root.add, output_func, input_data) @test_util.run_v2_only def testKerasModel(self): - input_data = constant_op.constant(1., shape=[1, 1]) + """Test a basic Keras model with Variables.""" + input_data = {"x": constant_op.constant(1., shape=[1, 1])} # Create a simple Keras model. x = [-1, 0, 1, 2, 3, 4] @@ -228,26 +232,14 @@ class VariablesToConstantsTest(test.TestCase): model.compile(optimizer="sgd", loss="mean_squared_error") model.fit(x, y, epochs=1) - # Get the concrete function from the Keras model. - @def_function.function + @def_function.function(input_signature=[ + tensor_spec.TensorSpec(shape=[1, 1], dtype=dtypes.float32) + ]) def to_save(x): return model(x) - input_func = to_save.get_concrete_function(input_data) - - variable_graph_def = input_func.graph.as_graph_def() - self.assertEqual(2, self._getNumVariables(variable_graph_def)) - - output_func = convert_to_constants.convert_variables_to_constants_v2( - input_func) - constant_graph_def = output_func.graph.as_graph_def() - self.assertEqual(0, self._getNumVariables(constant_graph_def)) - self.assertFalse(self._hasStatefulPartitionedCallOp(constant_graph_def)) - - # Check value. 
- expected_value = to_save(input_data) - actual_value = nest.flatten(output_func(input_data)) - self.assertEqual(expected_value.numpy(), actual_value) + root, output_func = self._freezeModel(to_save) + self._testConvertedFunction(root, root.f, output_func, input_data) def _singleMetaGraphSavedModel(self): export_graph = ops.Graph() @@ -276,21 +268,20 @@ class VariablesToConstantsTest(test.TestCase): @test_util.run_v2_only def testRefVariableImport(self): + """Test a model with 1.X ReferenceVariables.""" + input_data = {"start": constant_op.constant(1., shape=[1, 1])} + saved = self._singleMetaGraphSavedModel() imported = load(saved) fn = imported.signatures["serving_default"] - output_func = convert_to_constants.convert_variables_to_constants_v2(fn) - constant_graph_def = output_func.graph.as_graph_def() - self.assertEqual(0, self._getNumVariables(constant_graph_def)) - self.assertFalse(self._hasStatefulPartitionedCallOp(constant_graph_def)) - input_data = {"start": constant_op.constant(1., shape=[1, 1])} + output_func = convert_to_constants.convert_variables_to_constants_v2(fn) root = tracking.AutoTrackable() self._testConvertedFunction(root, fn, output_func, input_data) @test_util.run_v2_only def testIf(self): - """Test whether If op freezes correctly.""" + """Test a model with the If op.""" input_data = { "x": constant_op.constant([1., 2.], shape=[1, 2]), "b": constant_op.constant(True) @@ -312,22 +303,12 @@ class VariablesToConstantsTest(test.TestCase): return control_flow_ops.cond( b, true_fn=lambda: true_fn(x), false_fn=lambda: false_fn(x)) - root = tracking.AutoTrackable() - root.f = model - input_func = root.f.get_concrete_function() - input_func(**input_data) - - output_func = convert_to_constants.convert_variables_to_constants_v2( - input_func, lower_control_flow=False) - constant_graph_def = output_func.graph.as_graph_def() - self.assertEqual(0, self._getNumVariables(constant_graph_def)) - self.assertFalse(self._hasStatefulPartitionedCallOp(constant_graph_def)) - + root, output_func = self._freezeModel(model) self._testConvertedFunction(root, root.f, output_func, input_data) @test_util.run_v2_only def testStatelessIf(self): - """Test whether StatelessIf op freezes correctly.""" + """Test a model with the StatelessIf op.""" input_data = {"b": constant_op.constant(True)} x = constant_op.constant([1., 2.], shape=[1, 2], name="x") @@ -343,21 +324,12 @@ class VariablesToConstantsTest(test.TestCase): def model(b): return cond_v2.cond_v2(b, true_fn, false_fn) - root = tracking.AutoTrackable() - root.f = model - input_func = root.f.get_concrete_function() - input_func(**input_data) - - output_func = convert_to_constants.convert_variables_to_constants_v2( - input_func, lower_control_flow=False) - constant_graph_def = output_func.graph.as_graph_def() - self.assertEqual(0, self._getNumVariables(constant_graph_def)) - self.assertFalse(self._hasStatefulPartitionedCallOp(constant_graph_def)) - + root, output_func = self._freezeModel(model) self._testConvertedFunction(root, root.f, output_func, input_data) @test_util.run_v2_only def testStaticRnn(self): + """Test a StaticRnn containing If ops.""" input_data = { "x": constant_op.constant( @@ -374,20 +346,12 @@ class VariablesToConstantsTest(test.TestCase): return rnn.static_rnn( cell, seq, dtype=dtypes.float32, sequence_length=[1]) - root = tracking.AutoTrackable() - root.f = model - input_func = root.f.get_concrete_function() - - output_func = convert_to_constants.convert_variables_to_constants_v2( - input_func, lower_control_flow=False) - 
constant_graph_def = output_func.graph.as_graph_def() - self.assertEqual(0, self._getNumVariables(constant_graph_def)) - self.assertFalse(self._hasStatefulPartitionedCallOp(constant_graph_def)) - + root, output_func = self._freezeModel(model) self._testConvertedFunction(root, root.f, output_func, input_data) @test_util.run_v2_only def testLoop(self): + """Test a While loop.""" input_data = {"x": constant_op.constant([1., 2., 3., 4.], shape=[2, 2])} weights = variables.Variable([[0.1, 0.2], [0.3, 0.4]], dtype=dtypes.float32) @@ -404,21 +368,12 @@ class VariablesToConstantsTest(test.TestCase): def model(x): return control_flow_ops.while_loop(condition, body, [x]) - root = tracking.AutoTrackable() - root.f = model - input_func = root.f.get_concrete_function() - input_func(**input_data) - - output_func = convert_to_constants.convert_variables_to_constants_v2( - input_func, lower_control_flow=False) - constant_graph_def = output_func.graph.as_graph_def() - self.assertEqual(0, self._getNumVariables(constant_graph_def)) - self.assertFalse(self._hasStatefulPartitionedCallOp(constant_graph_def)) - + root, output_func = self._freezeModel(model) self._testConvertedFunction(root, root.f, output_func, input_data) @test_util.run_v2_only def testDynamicRnn(self): + """Test a DynamicRnn containing While loops.""" input_data = { "x": constant_op.constant( @@ -434,16 +389,7 @@ class VariablesToConstantsTest(test.TestCase): def model(x): return rnn.dynamic_rnn(cell, x, dtype=dtypes.float32) - root = tracking.AutoTrackable() - root.f = model - input_func = root.f.get_concrete_function() - - output_func = convert_to_constants.convert_variables_to_constants_v2( - input_func, lower_control_flow=False) - constant_graph_def = output_func.graph.as_graph_def() - self.assertEqual(0, self._getNumVariables(constant_graph_def)) - self.assertFalse(self._hasStatefulPartitionedCallOp(constant_graph_def)) - + root, output_func = self._freezeModel(model) self._testConvertedFunction(root, root.f, output_func, input_data) From 4c5910487ebdd30f29e4ac4741f884e09d63d23e Mon Sep 17 00:00:00 2001 From: Yongfeng Gu Date: Wed, 24 Jul 2019 22:39:12 -0400 Subject: [PATCH 0539/3053] Opt out DEVICE_GPU_XLA_JIT and DEVICE_XLA_GPU from ResizeNearestNeighborOp, ResizeBilinearOp, and ResizeBilinearGradOp, because the dilation-based approach may introduce convolutions too large for GPU to handle. 
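To make the motivation concrete: the dilation-based lowering materializes a resize as a general convolution whose filter extent grows with the upscaling factor, so a small feature map resized to a large output yields an enormous kernel. The sketch below is an illustration only, not code from this patch; the kernel-size arithmetic is an assumption about the lowering and the helper name is hypothetical.

    #include <cstdint>

    // Rough per-dimension filter extent of a dilation-based bilinear resize.
    // Assumes in > 1 and an integer align_corners-style scale (out - 1) / (in - 1).
    int64_t ApproxResizeFilterTaps(int64_t in, int64_t out) {
      const int64_t scale = (out - 1) / (in - 1);
      return 2 * scale - 1;
    }

    // E.g. resizing 2x2 -> 513x513 gives scale = 512, i.e. roughly 1023 taps per
    // spatial dimension and a filter on the order of a million weights, which is
    // the kind of convolution the GPU backend struggles with; the registrations
    // in the diff below therefore keep only the CPU JIT and XLA_CPU devices.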
--- .../tf2xla/kernels/image_resize_ops.cc | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc index b309541a864..04a37a433b4 100644 --- a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc @@ -587,7 +587,13 @@ void ResizeNearestNeighborOp::Compile(XlaOpKernelContext* ctx) { GeneralCompile(ctx, align_corners_, is_kernel_bilinear_); } -REGISTER_XLA_OP(Name("ResizeNearestNeighbor").CompileTimeConstantInput("size"), +REGISTER_XLA_OP(Name("ResizeNearestNeighbor") + .Device(DEVICE_CPU_XLA_JIT) + .CompileTimeConstantInput("size"), + ResizeNearestNeighborOp); +REGISTER_XLA_OP(Name("ResizeNearestNeighbor") + .Device(DEVICE_XLA_CPU) + .CompileTimeConstantInput("size"), ResizeNearestNeighborOp); ResizeBilinearOp::ResizeBilinearOp(OpKernelConstruction* ctx) @@ -604,7 +610,13 @@ void ResizeBilinearOp::Compile(XlaOpKernelContext* ctx) { GeneralCompile(ctx, align_corners_, is_kernel_bilinear_); } -REGISTER_XLA_OP(Name("ResizeBilinear").CompileTimeConstantInput("size"), +REGISTER_XLA_OP(Name("ResizeBilinear") + .Device(DEVICE_CPU_XLA_JIT) + .CompileTimeConstantInput("size"), + ResizeBilinearOp); +REGISTER_XLA_OP(Name("ResizeBilinear") + .Device(DEVICE_XLA_CPU) + .CompileTimeConstantInput("size"), ResizeBilinearOp); ResizeBilinearGradOp::ResizeBilinearGradOp(OpKernelConstruction* ctx) @@ -698,6 +710,7 @@ void ResizeBilinearGradOp::Compile(XlaOpKernelContext* ctx) { ctx->SetOutput(0, output); } -REGISTER_XLA_OP(Name("ResizeBilinearGrad"), ResizeBilinearGradOp); +REGISTER_XLA_OP(Name("ResizeBilinearGrad").Device(DEVICE_CPU_XLA_JIT), ResizeBilinearGradOp); +REGISTER_XLA_OP(Name("ResizeBilinearGrad").Device(DEVICE_XLA_CPU), ResizeBilinearGradOp); } // namespace tensorflow From 97029c72c7d123a1c0a52e601e522e27f40712b6 Mon Sep 17 00:00:00 2001 From: Edward Loper Date: Wed, 24 Jul 2019 20:30:38 -0700 Subject: [PATCH 0540/3053] Fix type-checking bug from PR #30615. That PR checked if a value was a unicode string using isinstance(debug_info_str, str). But in Python 2.x, `str` is the byte-string type. So check against `bytes` instead. PiperOrigin-RevId: 259873125 --- tensorflow/lite/python/BUILD | 1 - tensorflow/lite/python/convert.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow/lite/python/BUILD b/tensorflow/lite/python/BUILD index df7c07ff5d4..9316da8e94c 100644 --- a/tensorflow/lite/python/BUILD +++ b/tensorflow/lite/python/BUILD @@ -111,7 +111,6 @@ py_test( srcs = ["lite_v2_test.py"], srcs_version = "PY2AND3", tags = [ - "no_oss", "no_windows", ], deps = [ diff --git a/tensorflow/lite/python/convert.py b/tensorflow/lite/python/convert.py index 328e44ec984..9fe8b25c0e6 100644 --- a/tensorflow/lite/python/convert.py +++ b/tensorflow/lite/python/convert.py @@ -161,7 +161,7 @@ def toco_convert_protos(model_flags_str, # Some of the subtests within the "convert_test" unit-test fail # with the error shown above. So watch out for that scenario and # convert debug_info_str to bytes where needed - if isinstance(debug_info_str, str): + if not isinstance(debug_info_str, bytes): fp_debug.write(debug_info_str.encode("utf-8")) else: fp_debug.write(debug_info_str) From 2494aa9c98b10cd4a16458f0934e11c15a658b63 Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Wed, 24 Jul 2019 21:09:06 -0700 Subject: [PATCH 0541/3053] Use Tensor in convert_tensor instead of TensorProto. 
The main runtime entry uses Tensors, so use that instead. Removed the TensorProto parts (converted to Tensor instead on TensorProto path to enable reuse). Also update three tests that changed due to import and one that failed to convert TensorProto to proto. PiperOrigin-RevId: 259877372 --- .../graph-empty-tensor-content.pbtxt | 2 +- .../graphdef2mlir/graph-version-info.pbtxt | 1 - .../tests/graphdef2mlir/string-attr.pbtxt | 2 +- .../mlir/tensorflow/utils/convert_tensor.cc | 183 ++++-------------- 4 files changed, 45 insertions(+), 143 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-empty-tensor-content.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-empty-tensor-content.pbtxt index c023c7e6658..12d05c1195f 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-empty-tensor-content.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-empty-tensor-content.pbtxt @@ -3,7 +3,7 @@ # This test is intended to verify the tensor_content field on import of an empty # tensor. # CHECK: tf.Const -# CHECK-SAME: value = opaque<"tf", "0x746674656E736F722464747970653A2044545F464C4F41542074656E736F725F7368617065207B2064696D207B2073697A653A2031207D207D"> +# CHECK-SAME: value = dense<0.000000e+00> node { name: "Const" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-version-info.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-version-info.pbtxt index 5f8e7854161..20bf33d7fb2 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-version-info.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-version-info.pbtxt @@ -29,7 +29,6 @@ node { size: 2 } } - tensor_content: "\350\251\242>\276\335r?" } } } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/string-attr.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/string-attr.pbtxt index c6f0730070f..a03753184b1 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/string-attr.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/string-attr.pbtxt @@ -42,6 +42,6 @@ versions { } # CHECK: func @main() { -# CHECK-NEXT: %0:2 = "_tf.Const"() {_output_shapes = ["tfshape$dim { size: 3 }"], device = "", dtype = "tfdtype$DT_STRING", name = "save/SaveV2/shape_and_slices", value = opaque<"tf", "0x746674656E736F722464747970653A2044545F535452494E472074656E736F725F7368617065207B2064696D207B2073697A653A2033207D207D20737472696E675F76616C3A20222220737472696E675F76616C3A20222220737472696E675F76616C3A202222"> : tensor<3x!tf.string>} : () -> (tensor<3x!tf.string>, !_tf.control) +# CHECK-NEXT: %0:2 = "_tf.Const"() {_output_shapes = ["tfshape$dim { size: 3 }"], device = "", dtype = "tfdtype$DT_STRING", name = "save/SaveV2/shape_and_slices", value = opaque<"tf", "0x746674656E736F722464747970653A2044545F535452494E472074656E736F725F7368617065207B2064696D207B2073697A653A2033207D207D2074656E736F725F636F6E74656E743A20225C3030305C3030305C30303022"> : tensor<3x!tf.string>} : () -> (tensor<3x!tf.string>, !_tf.control) # CHECK-NEXT: return # CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc index 380d1253370..f8b2ea44930 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc @@ -35,7 +35,6 @@ limitations under the License. 
#include "tensorflow/core/framework/tensor_shape.pb.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/platform/cord.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/stream_executor/lib/statusor.h" @@ -57,6 +56,14 @@ using mlir::SplatElementsAttr; using mlir::Type; using tensorflow::errors::InvalidArgument; +void ConvertToMlirShape(const TensorShape& input_shape, + llvm::SmallVectorImpl* shape) { + shape->reserve(input_shape.dims()); + for (const auto& d : input_shape) { + shape->push_back(d.size); + } +} + Status ConvertToMlirShape(const TensorShapeProto& input_shape, llvm::SmallVectorImpl* shape) { shape->reserve(input_shape.dim_size()); @@ -70,13 +77,16 @@ Status ConvertToMlirShape(const TensorShapeProto& input_shape, return Status::OK(); } -// Converts an TensorFlow tensor proto to an MLIR opaque elements attribute. -StatusOr ConvertToOpaqueElementsAttr( - const TensorProto& input_tensor, ShapedType type, Builder* builder) { +// Converts a TensorFlow tensor to an MLIR opaque elements attribute. +StatusOr ConvertToOpaqueElementsAttr(const Tensor& input_tensor, + ShapedType type, + Builder* builder) { + TensorProto tensor_proto; + input_tensor.AsProtoTensorContent(&tensor_proto); // TODO(shpeisman): restructure code to reuse dialect pointer across calls. auto* dialect = builder->getContext()->getRegisteredDialect("tf"); return builder->getOpaqueElementsAttr( - dialect, type, mangling_util::MangleTensor(input_tensor)); + dialect, type, mangling_util::MangleTensor(tensor_proto)); } // Template predicate that provides a constant member `value` equal to true if @@ -101,154 +111,45 @@ struct IsBatchCopyable< std::numeric_limits::digits == std::numeric_limits::digits; }; -// Converts an TensorFlow tensor proto to an MLIR dense elements attribute. -// To save the memory held by the attribute, the value is casted to the -// specified type. -template -typename std::enable_if::value, - StatusOr>::type -ConvertToDenseElementsAttr( - const tensorflow::protobuf::RepeatedField& values, ShapedType type, - Builder* builder) { - return mlir::DenseElementsAttr::get( - type, llvm::makeArrayRef(values.data(), values.size())); -} - -template -typename std::enable_if::value, - StatusOr>::type -ConvertToDenseElementsAttr( - const tensorflow::protobuf::RepeatedField& values, ShapedType type, - Builder* builder) { - std::vector buff; - buff.reserve(values.size()); - for (auto value : values) { - buff.push_back(value); - } - return mlir::DenseElementsAttr::get(type, llvm::makeArrayRef(buff)); -} - -// Convert a TensorFlow tensor from its raw serialization into a -// DenseElementAttr. This is a wrapper around mlir::DenseElementsAttr that -// creates a temporary copy of the data for satisfying strict aliasing -// defensively. TODO(aminim): this extra copy should not be needed, -// DenseElementAttr will perform a similar copy internally. -// Template parameter `T` must match the element type of the `type` argument -// (this is checked in DenseElementsAttr::get()). +// Converts a TensorFlow tensor into an MLIR elements attribute. 
template -mlir::DenseElementsAttr ConvertToDenseElementsAttr(const absl::Cord& values, - ShapedType type, - Builder* builder) { - DCHECK_EQ((values.size() % sizeof(T)), 0) - << "unexpected size vs elt type mismatch"; - int n_elements = values.size() / sizeof(T); - auto data = absl::make_unique(n_elements); - // This assumes that the endianess conversion was handled when loading the - // tensor in memory. - values.CopyToArray(reinterpret_cast(data.get())); +StatusOr ConvertFlatTensor(const Tensor& input_tensor, + ShapedType type, Builder* builder) { + auto arr = input_tensor.flat(); return mlir::DenseElementsAttr::get( - type, llvm::makeArrayRef(data.get(), n_elements)); + type, llvm::makeArrayRef(arr.data(), arr.size())); } -// Converts an TensorFlow tensor proto with DT_FLOAT data type into an MLIR +// Converts a TensorFlow tensor proto with DT_BOOL data type into an MLIR // elements attribute. -StatusOr ConvertFloatTensor(const TensorProto& input_tensor, - ShapedType type, Builder* builder) { - // When the repeated "float_val" field only has one element, it is converted - // to a splat elements attribute; When it has more than one element, it is - // converted to a dense elements attribute; otherwise, convert the whole - // tensor to an opaque elements attribute if the "tensor_content" field is - // set. - auto repeated_val_size = input_tensor.float_val_size(); - if (repeated_val_size == 1 || repeated_val_size == type.getNumElements()) { - return ConvertToDenseElementsAttr(input_tensor.float_val(), - type, builder); - } - auto raw_data = input_tensor.tensor_content(); - if (raw_data.size() == type.getSizeInBits() / 8) - return ConvertToDenseElementsAttr(raw_data, type, builder); - return ConvertToOpaqueElementsAttr(input_tensor, type, builder); -} - -// Converts an TensorFlow tensor proto with DT_INT32, DT_INT16, DT_INT8, -// DT_UINT8, DT_QUINT8 data type into an MLIR elements attribute. -template -StatusOr ConvertIntTensor(const TensorProto& input_tensor, - ShapedType type, Builder* builder) { - // When the repeated "int_val" field only has one element, it is converted to - // a splat elements attribute; When it has more than one element, it is - // converted to a dense elements attribute; otherwise, convert the whole - // tensor to an opaque elements attribute if the "tensor_content" field is - // set. - auto repeated_val_size = input_tensor.int_val_size(); - if (repeated_val_size == 1 || repeated_val_size == type.getNumElements()) { - return ConvertToDenseElementsAttr(input_tensor.int_val(), type, - builder); - } - auto raw_data = input_tensor.tensor_content(); - if (raw_data.size() == type.getSizeInBits() / 8) - return ConvertToDenseElementsAttr(raw_data, type, builder); - - return ConvertToOpaqueElementsAttr(input_tensor, type, builder); -} - -// Converts an TensorFlow tensor proto with DT_INT64 data type into an MLIR -// elements attribute. -StatusOr ConvertInt64Tensor(const TensorProto& input_tensor, - ShapedType type, Builder* builder) { - // When the repeated "int64_val" field only has one element, it is converted - // to a splat elements attribute; When it has more than one element, it is - // converted to a dense elements attribute; otherwise, convert the whole - // tensor to an opaque elements attribute if the "tensor_content" field is - // set. 
- auto repeated_val_size = input_tensor.int64_val_size(); - if (repeated_val_size == 1 || repeated_val_size == type.getNumElements()) { - return ConvertToDenseElementsAttr(input_tensor.int64_val(), type, - builder); - } - auto raw_data = input_tensor.tensor_content(); - if (raw_data.size() == type.getSizeInBits() / 8) - return ConvertToDenseElementsAttr(raw_data, type, builder); - return ConvertToOpaqueElementsAttr(input_tensor, type, builder); -} - -// Converts an TensorFlow tensor proto with DT_BOOL data type into an MLIR -// elements attribute. -StatusOr ConvertBoolTensor(const TensorProto& input_tensor, +StatusOr ConvertBoolTensor(const Tensor& input_tensor, ShapedType type, Builder* builder) { // When the repeated "bool_val" field only has one element, it is converted to // a splat elements attribute; When it has more than one element, it is // converted to a dense elements attribute; otherwise, convert the whole // tensor to an opaque elements attribute if the "tensor_content" field is // set. - auto repeated_val_size = input_tensor.bool_val_size(); - if (repeated_val_size == 1 || repeated_val_size == type.getNumElements()) { - const auto& proto = input_tensor.bool_val(); - return mlir::DenseElementsAttr::get( - type, llvm::makeArrayRef(proto.data(), proto.size())); - } return ConvertToOpaqueElementsAttr(input_tensor, type, builder); } -StatusOr ConvertTensorProto(const TensorProto& input_tensor, - Builder* builder) { +StatusOr ConvertTensor(const Tensor& input_tensor, + Builder* builder) { const auto& input_dtype = input_tensor.dtype(); - const auto& input_shape = input_tensor.tensor_shape(); + const auto& input_shape = input_tensor.shape(); Type elt_type; TF_RETURN_IF_ERROR(ConvertDataType(input_dtype, *builder, &elt_type)); SmallVector shape; - TF_RETURN_IF_ERROR(ConvertToMlirShape(input_shape, &shape)); + ConvertToMlirShape(input_shape, &shape); auto type = builder->getTensorType(shape, elt_type); // TODO(fengliuai): customize the conversions for more types. switch (input_dtype) { case DT_FLOAT: - return ConvertFloatTensor(input_tensor, type, builder); + return ConvertFlatTensor(input_tensor, type, builder); case DT_INT32: - return ConvertIntTensor(input_tensor, type, builder); + return ConvertFlatTensor(input_tensor, type, builder); case DT_INT64: - return ConvertInt64Tensor(input_tensor, type, builder); + return ConvertFlatTensor(input_tensor, type, builder); case DT_BOOL: return ConvertBoolTensor(input_tensor, type, builder); default: @@ -259,17 +160,19 @@ StatusOr ConvertTensorProto(const TensorProto& input_tensor, // calls. auto* dialect = builder->getContext()->getRegisteredDialect("tf"); + TensorProto tensor_proto; + input_tensor.AsProtoTensorContent(&tensor_proto); return builder->getOpaqueElementsAttr( - dialect, type, mangling_util::MangleTensor(input_tensor)); + dialect, type, mangling_util::MangleTensor(tensor_proto)); } } -StatusOr ConvertTensor(const Tensor& input_tensor, - mlir::Builder* builder) { - TensorProto input_proto; - // This decodes the tensor content into a proper proto field. 
- input_tensor.AsProtoField(&input_proto); - return ConvertTensorProto(input_proto, builder); +StatusOr ConvertTensorProto(const TensorProto& input_tensor, + Builder* builder) { + Tensor t; + if (!t.FromProto(input_tensor)) + return InvalidArgument("Failed to parse input_tensor."); + return ConvertTensor(t, builder); } Status ConvertToTensorShapeProto(ArrayRef shape, @@ -280,7 +183,7 @@ Status ConvertToTensorShapeProto(ArrayRef shape, return Status::OK(); } -// Converts an MLIR opaque elements attribute to an TensorFlow tensor proto. +// Converts an MLIR opaque elements attribute to a TensorFlow tensor proto. Status ConvertOpaqueElementsAttr(const ElementsAttr attr, TensorProto* output_tensor) { if (attr.isa()) { @@ -291,7 +194,7 @@ Status ConvertOpaqueElementsAttr(const ElementsAttr attr, return InvalidArgument("Unexpected elements attribute type from MLIR."); } -// Converts an MLIR elements attribute to an TensorFlow tensor proto +// Converts an MLIR elements attribute to a TensorFlow tensor proto // with the float_val field updated. Status ConvertFloatElementsAttr(const ElementsAttr attr, TensorProto* output_tensor) { @@ -305,7 +208,7 @@ Status ConvertFloatElementsAttr(const ElementsAttr attr, return Status::OK(); } -// Converts an MLIR elements attribute to an TensorFlow tensor proto +// Converts an MLIR elements attribute to a TensorFlow tensor proto // with the int_val field updated. Status ConvertIntElementsAttr(const mlir::ElementsAttr attr, TensorProto* output_tensor) { @@ -319,7 +222,7 @@ Status ConvertIntElementsAttr(const mlir::ElementsAttr attr, return Status::OK(); } -// Converts an MLIR elements attribute to an TensorFlow tensor proto +// Converts an MLIR elements attribute to a TensorFlow tensor proto // with the int64_val field updated. Status ConvertInt64ElementsAttr(const mlir::ElementsAttr attr, TensorProto* output_tensor) { @@ -333,7 +236,7 @@ Status ConvertInt64ElementsAttr(const mlir::ElementsAttr attr, return Status::OK(); } -// Converts an MLIR elements attribute to an TensorFlow tensor proto +// Converts an MLIR elements attribute to a TensorFlow tensor proto // with bool_val field updated. Status ConvertBoolElementsAttr(const mlir::ElementsAttr attr, TensorProto* output_tensor) { From c78501e3de57b8ee5af2be1c1646239596f3075b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 24 Jul 2019 21:47:44 -0700 Subject: [PATCH 0542/3053] Add an HloPrintOption that ignores unique IDs in names. 
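The new Fingerprint() preset allows two modules to be compared structurally without being tripped up by the unique numeric suffixes in instruction and computation names, so that a name like add.42 is printed simply as add. A small usage sketch, not part of this change; module_a and module_b stand in for existing xla::HloModule objects:

    // Print both modules with unique IDs stripped from names and compare the
    // resulting strings to check for structural equality.
    const std::string fp_a = module_a.ToString(xla::HloPrintOptions::Fingerprint());
    const std::string fp_b = module_b.ToString(xla::HloPrintOptions::Fingerprint());
    const bool structurally_equal = (fp_a == fp_b);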
PiperOrigin-RevId: 259880767 --- .../compiler/xla/service/hlo_computation.cc | 10 ++-- .../compiler/xla/service/hlo_computation.h | 2 +- .../compiler/xla/service/hlo_instruction.cc | 53 ++++++++++++------- .../compiler/xla/service/hlo_instruction.h | 29 +++++++++- .../compiler/xla/service/hlo_instructions.cc | 8 +-- tensorflow/compiler/xla/service/hlo_module.cc | 2 +- 6 files changed, 75 insertions(+), 29 deletions(-) diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc index 639e853ada7..5ce8b2b2613 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.cc +++ b/tensorflow/compiler/xla/service/hlo_computation.cc @@ -532,11 +532,12 @@ string HloComputation::ToString( if (options.print_percent()) { s << "%"; } - s << name() << " "; + s << PrintName(name(), options.print_ids()) << " "; } if (options.print_program_shape()) { - s << ShapeUtil::HumanString(ComputeProgramShape()) << " "; + s << ShapeUtil::HumanString(ComputeProgramShape(options.print_ids())) + << " "; } s << "{\n"; { @@ -753,12 +754,13 @@ StatusOr HloComputation::DeepCopyInstructionWithCustomCopier( return DeepCopyHelper(instruction, &index, copy_leaf); } -ProgramShape HloComputation::ComputeProgramShape() const { +ProgramShape HloComputation::ComputeProgramShape(bool include_ids) const { ProgramShape program_shape; for (auto* param_instruction : param_instructions_) { *program_shape.add_parameters() = param_instruction->shape(); - *program_shape.add_parameter_names() = param_instruction->name(); + *program_shape.add_parameter_names() = + PrintName(param_instruction->name(), include_ids); } *program_shape.mutable_result() = root_instruction_->shape(); diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h index 111b28a8610..bdbc92e375e 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.h +++ b/tensorflow/compiler/xla/service/hlo_computation.h @@ -288,7 +288,7 @@ class HloComputation { // Computes and returns the ProgramShape of this computation (shape of // parameters and result with layout). - ProgramShape ComputeProgramShape() const; + ProgramShape ComputeProgramShape(bool include_ids = true) const; // Return whether `*this` and `other` are functionally equivalent. bool Equal(const HloComputation& other, bool is_layout_sensitive) const; diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index ddfcdcfd293..f7d36fca7b7 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -2179,10 +2179,20 @@ string HloInstruction::SignatureString() const { return StrCat("(", operands, ") -> ", ShapeUtil::HumanString(shape())); } +string PrintName(const string& name, bool print_ids) { + if (print_ids) { + return name; + } else { + auto dot_position = name.find_first_of("."); + return name.substr(0, dot_position); + } +} + namespace { -string PrintName(const string& name, const HloPrintOptions& options) { - return StrCat(options.print_percent() ? "%" : "", name); +string PrintNameInternal(const string& name, const HloPrintOptions& options) { + return StrCat(options.print_percent() ? "%" : "", + PrintName(name, options.print_ids())); } } // namespace @@ -2277,11 +2287,12 @@ string HloInstruction::ToStringWithCanonicalNameMap( // If we are canonicalizing instruction names and this is a top-level // HloInstruction::ToString() call, don't print an instruction name. 
StrAppend(&result, - PrintName(canonical_name_map->LookupOrInsert(name()), options), + PrintNameInternal(canonical_name_map->LookupOrInsert(name()), + options), " = "); } } else { - StrAppend(&result, PrintName(name(), options), " = "); + StrAppend(&result, PrintNameInternal(name(), options), " = "); } // Print shape. @@ -2347,10 +2358,10 @@ string HloInstruction::OperandsToStringWithCanonicalNameMap( // part of the canonical string. if (options.canonicalize_instruction_names() && options.is_in_nested_computation()) { - str.push_back(PrintName( + str.push_back(PrintNameInternal( canonical_name_map->LookupOrInsert(operand->name()), options)); } else if (options.print_operand_names()) { - str.push_back(PrintName(operand->name(), options)); + str.push_back(PrintNameInternal(operand->name(), options)); } StrAppend(out, StrJoin(str, " ")); }); @@ -2368,27 +2379,30 @@ std::vector HloInstruction::ExtraAttributesToString( if (options.print_subcomputation_mode() == HloPrintOptions::PrintSubcomputationMode::kNameOnly) { if (opcode() == HloOpcode::kWhile) { + extra.push_back(StrCat( + "condition=", PrintNameInternal(while_condition()->name(), options))); extra.push_back( - StrCat("condition=", PrintName(while_condition()->name(), options))); - extra.push_back( - StrCat("body=", PrintName(while_body()->name(), options))); + StrCat("body=", PrintNameInternal(while_body()->name(), options))); } else if (opcode() == HloOpcode::kSelectAndScatter) { - extra.push_back(StrCat("select=", PrintName(select()->name(), options))); extra.push_back( - StrCat("scatter=", PrintName(scatter()->name(), options))); + StrCat("select=", PrintNameInternal(select()->name(), options))); + extra.push_back( + StrCat("scatter=", PrintNameInternal(scatter()->name(), options))); } else if (opcode() == HloOpcode::kConditional) { if (operand(0)->shape().element_type() == PRED) { - extra.push_back(StrCat("true_computation=", - PrintName(true_computation()->name(), options))); + extra.push_back( + StrCat("true_computation=", + PrintNameInternal(true_computation()->name(), options))); extra.push_back( StrCat("false_computation=", - PrintName(false_computation()->name(), options))); + PrintNameInternal(false_computation()->name(), options))); } else { extra.push_back(StrCat( "branch_computations={", StrJoin(branch_computations(), ", ", [&](string* out, const HloComputation* computation) { - StrAppend(out, PrintName(computation->name(), options)); + StrAppend( + out, PrintNameInternal(computation->name(), options)); }), "}")); } @@ -2399,13 +2413,14 @@ std::vector HloInstruction::ExtraAttributesToString( opcode() == HloOpcode::kScatter || opcode() == HloOpcode::kSort) { extra.push_back( - StrCat("to_apply=", PrintName(to_apply()->name(), options))); + StrCat("to_apply=", PrintNameInternal(to_apply()->name(), options))); } else if (!called_computations().empty()) { extra.push_back(StrCat( "calls=", StrJoin(called_computations(), ", ", [&](string* out, const HloComputation* computation) { - StrAppend(out, PrintName(computation->name(), options)); + StrAppend(out, + PrintNameInternal(computation->name(), options)); }))); } } else if (options.print_subcomputation_mode() == @@ -2473,8 +2488,8 @@ std::vector HloInstruction::ExtraAttributesToString( extra.push_back(StrCat("control-predecessors={", StrJoin(control_predecessors_, ", ", [&](string* out, HloInstruction* pre) { - StrAppend(out, - PrintName(pre->name(), options)); + StrAppend(out, PrintNameInternal( + pre->name(), options)); }), "}")); } diff --git 
a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index fbaeb5d5f66..78128a766b0 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -63,6 +63,8 @@ namespace xla { class HloComputation; class HloModule; +string PrintName(const string& name, bool print_ids); + // A bunch of switches that control how the hlo text should be printed. class HloPrintOptions { public: @@ -88,7 +90,8 @@ class HloPrintOptions { print_control_dependencies_(true), canonicalize_instruction_names_(false), indent_amount_(0), - is_in_nested_computation_(false) {} + is_in_nested_computation_(false), + print_ids_(true) {} static HloPrintOptions ShortParsable() { return HloPrintOptions() @@ -118,6 +121,22 @@ class HloPrintOptions { .set_canonicalize_instruction_names(true); } + // Options to produce a fingerprint of an HLO. + static HloPrintOptions Fingerprint() { + return HloPrintOptions() + .set_print_subcomputation_mode(PrintSubcomputationMode::kNameOnly) + .set_print_metadata(false) + .set_print_backend_config(false) + .set_compact_operands(true) + .set_print_operand_names(false) + .set_print_operand_shape(true) + .set_print_program_shape(false) + .set_print_percent(false) + .set_print_control_dependencies(false) + .set_canonicalize_instruction_names(true) + .set_print_ids(false); + } + // If true, large constants will be printed out. HloPrintOptions& set_print_large_constants(bool value) { print_large_constants_ = value; @@ -154,6 +173,12 @@ class HloPrintOptions { return *this; } + // If true, all printed names include unique identifiers. + HloPrintOptions& set_print_ids(bool value) { + print_ids_ = value; + return *this; + } + // If true, program shape of hlo computations will be printed. HloPrintOptions& set_print_program_shape(bool value) { print_program_shape_ = value; @@ -216,6 +241,7 @@ class HloPrintOptions { bool include_layout_in_shapes() const { return include_layout_in_shapes_; } bool print_operand_shape() const { return print_operand_shape_; } bool print_operand_names() const { return print_operand_names_; } + bool print_ids() const { return print_ids_; } bool print_program_shape() const { return print_program_shape_; } bool print_percent() const { return print_percent_; } bool print_control_dependencies() const { @@ -242,6 +268,7 @@ class HloPrintOptions { bool canonicalize_instruction_names_; int indent_amount_; bool is_in_nested_computation_; + bool print_ids_; }; // For canonical string output, we need to have a canonical way to rename diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc index 52d8c7a43ce..312dc1b1d62 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.cc +++ b/tensorflow/compiler/xla/service/hlo_instructions.cc @@ -1737,7 +1737,7 @@ HloInstructionProto HloParameterInstruction::ToProto() const { } std::vector HloParameterInstruction::ExtraAttributesToStringImpl( - const HloPrintOptions& /*options*/) const { + const HloPrintOptions& options) const { std::vector result; if (!parameter_replicated_at_leaf_buffers_) { return result; @@ -1746,8 +1746,10 @@ std::vector HloParameterInstruction::ExtraAttributesToStringImpl( for (bool replicated : *parameter_replicated_at_leaf_buffers_) { buffers_replicated_strs.push_back(replicated ? 
"true" : "false"); } - result.push_back(StrCat("parameter_replication={", - StrJoin(buffers_replicated_strs, ","), "}")); + if (options.print_ids()) { + result.push_back(StrCat("parameter_replication={", + StrJoin(buffers_replicated_strs, ","), "}")); + } return result; } diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc index fbef51c4ce6..508c7a1561b 100644 --- a/tensorflow/compiler/xla/service/hlo_module.cc +++ b/tensorflow/compiler/xla/service/hlo_module.cc @@ -215,7 +215,7 @@ void HloModule::ReplaceComputations( string HloModule::ToString(const HloPrintOptions& options) const { std::ostringstream s; - s << "HloModule " << name(); + s << "HloModule " << PrintName(name(), options.print_ids()); if (has_schedule()) { TF_CHECK_OK(schedule().Verify()); s << ", is_scheduled=true"; From 1d436a85fe50dba9ea8eec6675b7cbfa54941e84 Mon Sep 17 00:00:00 2001 From: Karmel Allison Date: Wed, 24 Jul 2019 22:10:12 -0700 Subject: [PATCH 0543/3053] Determine batch size when building the DataAdaptor for numpy + similar types. The code for the batch_size=0 case was already there, though we threw an error before ever reaching it. This removes the error and adds logic for handling steps. If no steps are passed AND no batch size is passed, we still throw an error. PiperOrigin-RevId: 259883052 --- .../python/keras/engine/data_adapter.py | 30 ++++++++------- .../python/keras/engine/data_adapter_test.py | 38 ++++++++++++++----- tensorflow/python/keras/engine/training_v2.py | 2 +- 3 files changed, 45 insertions(+), 25 deletions(-) diff --git a/tensorflow/python/keras/engine/data_adapter.py b/tensorflow/python/keras/engine/data_adapter.py index a25ffe906ce..139fcd914c4 100644 --- a/tensorflow/python/keras/engine/data_adapter.py +++ b/tensorflow/python/keras/engine/data_adapter.py @@ -184,7 +184,7 @@ class TensorLikeDataAdapter(DataAdapter): return all(_is_tensor_or_composite(v) for v in flat_inputs) def __init__(self, x, y=None, sample_weights=None, batch_size=None, - shuffle=False, **kwargs): + steps=None, shuffle=False, **kwargs): super(TensorLikeDataAdapter, self).__init__(x, y, **kwargs) x = _process_numpy_inputs(x) y = _process_numpy_inputs(y) @@ -207,23 +207,25 @@ class TensorLikeDataAdapter(DataAdapter): else: inputs = (x,) - if not batch_size: - raise ValueError( - "`batch_size` is required for `Tensor` or `NumPy` input data.") - dataset = dataset_ops.DatasetV2.from_tensor_slices(inputs) num_samples = int(nest.flatten(x)[0].shape[0]) if shuffle: dataset = dataset.shuffle(num_samples) - if batch_size: - dataset = dataset.batch(batch_size) - self._size = int(math.ceil(num_samples / batch_size)) - self._batch_size = batch_size - self._has_partial_batch = (self._size != (num_samples // batch_size)) - else: - self._size = 1 - self._batch_size = num_samples - self._has_partial_batch = False + + # If batch_size is not passed but steps is, calculate from the input data. 
+ if steps and not batch_size: + batch_size = int(math.ceil(num_samples/steps)) + + if not batch_size: + raise ValueError( + "`batch_size` or `steps` is required for `Tensor` or `NumPy`" + " input data.") + + dataset = dataset.batch(batch_size) + self._size = int(math.ceil(num_samples / batch_size)) + self._batch_size = batch_size + self._has_partial_batch = (self._size != (num_samples // batch_size)) + self._partial_batch_size = None if self._has_partial_batch: self._partial_batch_size = ( diff --git a/tensorflow/python/keras/engine/data_adapter_test.py b/tensorflow/python/keras/engine/data_adapter_test.py index 5564e6c02f9..8f5fe16acdc 100644 --- a/tensorflow/python/keras/engine/data_adapter_test.py +++ b/tensorflow/python/keras/engine/data_adapter_test.py @@ -18,6 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized import numpy as np from tensorflow.python import keras @@ -31,7 +32,7 @@ from tensorflow.python.platform import test @test_util.run_all_in_graph_and_eager_modes -class DataAdapterTestBase(test.TestCase): +class DataAdapterTestBase(test.TestCase, parameterized.TestCase): def setUp(self): super(DataAdapterTestBase, self).setUp() @@ -83,7 +84,8 @@ class TensorLikeDataAdapterTest(DataAdapterTestBase): self.assertFalse(self.adapter_cls.can_handle(self.sequence_input)) def test_iterator_expect_batch_size_numpy(self): - with self.assertRaisesRegexp(ValueError, r'`batch_size` is required'): + with self.assertRaisesRegexp( + ValueError, r'`batch_size` or `steps` is required'): self.adapter_cls(self.numpy_input, self.numpy_target) def test_size_numpy(self): @@ -131,17 +133,33 @@ class TensorLikeDataAdapterTest(DataAdapterTestBase): self.assertEqual(adapter.get_size(), 10) self.assertFalse(adapter.has_partial_batch()) - def test_batch_size(self): + @parameterized.named_parameters( + ('batch_size_5', 5, None, 5), + ('batch_size_50', 50, 4, 50), # Sanity check: batch_size takes precedence + ('steps_1', None, 1, 50), + ('steps_4', None, 4, 13), + ) + def test_batch_size(self, batch_size_in, steps, batch_size_out): adapter = self.adapter_cls( - self.tensor_input, self.tensor_target, batch_size=5) - self.assertEqual(adapter.batch_size(), 5) + self.tensor_input, self.tensor_target, batch_size=batch_size_in, + steps=steps) + self.assertEqual(adapter.batch_size(), batch_size_out) - def test_partial_batch(self): + @parameterized.named_parameters( + ('batch_size_5', 5, None, 10, 0), + ('batch_size_4', 4, None, 13, 2), + ('steps_1', None, 1, 1, 0), + ('steps_5', None, 5, 5, 0), + ('steps_4', None, 4, 4, 11), + ) + def test_partial_batch( + self, batch_size_in, steps, size, partial_batch_size): adapter = self.adapter_cls( - self.tensor_input, self.tensor_target, batch_size=4) - self.assertEqual(adapter.get_size(), 13) # 50/4 - self.assertTrue(adapter.has_partial_batch()) - self.assertEqual(adapter.partial_batch_size(), 2) + self.tensor_input, self.tensor_target, batch_size=batch_size_in, + steps=steps) + self.assertEqual(adapter.get_size(), size) # 50/steps + self.assertEqual(adapter.has_partial_batch(), bool(partial_batch_size)) + self.assertEqual(adapter.partial_batch_size(), partial_batch_size or None) class DatasetAdapterTest(DataAdapterTestBase): diff --git a/tensorflow/python/keras/engine/training_v2.py b/tensorflow/python/keras/engine/training_v2.py index 5d098476800..2371d20684b 100644 --- a/tensorflow/python/keras/engine/training_v2.py +++ 
b/tensorflow/python/keras/engine/training_v2.py @@ -555,7 +555,7 @@ def _process_inputs(model, x, y, batch_size=None, sample_weights=None, # important which contains on-fly model build/tensor align for dict input, # etc. We should still call the _standardize_user_data with the peeked data # from generator or sequence, and let model compile. - return adapter_cls(x, y, batch_size=batch_size, + return adapter_cls(x, y, batch_size=batch_size, steps=steps, sample_weights=sample_weights, shuffle=shuffle, distribution_strategy=distribution_strategy) From 416942991e64169b1aa158d2a3d6d18d46362f2e Mon Sep 17 00:00:00 2001 From: Igor Saprykin Date: Wed, 24 Jul 2019 22:54:23 -0700 Subject: [PATCH 0544/3053] Make sure each saved model test is executed in its own directory. PiperOrigin-RevId: 259886583 --- .../python/distribute/saved_model_test_base.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/distribute/saved_model_test_base.py b/tensorflow/python/distribute/saved_model_test_base.py index c17c0e3ef49..245e258ffdb 100644 --- a/tensorflow/python/distribute/saved_model_test_base.py +++ b/tensorflow/python/distribute/saved_model_test_base.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function import os +import tempfile from absl.testing import parameterized import numpy as np @@ -154,8 +155,8 @@ class TestSavedModelBase(test.TestCase, parameterized.TestCase): distribution, run_distributed): """Save a model without DS, and restore it with DS.""" - saved_dir = os.path.join(self.get_temp_dir(), self._root_dir, - 'test_save_no_dist_restore_dist') + saved_dir = os.path.join(tempfile.mkdtemp(dir=self.get_temp_dir()), + self._root_dir, 'test_save_no_dist_restore_dist') model, output_name = model_and_input.get_model( run_distributed=run_distributed) @@ -182,8 +183,8 @@ class TestSavedModelBase(test.TestCase, parameterized.TestCase): run_distributed): """Save a model with DS, and restore it without DS.""" - saved_dir = os.path.join(self.get_temp_dir(), self._root_dir, - 'test_save_no_dist_restore_dist') + saved_dir = os.path.join(tempfile.mkdtemp(dir=self.get_temp_dir()), + self._root_dir, 'test_save_no_dist_restore_dist') with distribution.scope(): model, output_name = model_and_input.get_model( @@ -215,8 +216,8 @@ class TestSavedModelBase(test.TestCase, parameterized.TestCase): save_in_scope, run_distributed): """Save a model with DS, and restore it with potentially different DS.""" - saved_dir = os.path.join(self.get_temp_dir(), self._root_dir, - 'test_save_dist_restore_dist') + saved_dir = os.path.join(tempfile.mkdtemp(dir=self.get_temp_dir()), + self._root_dir, 'test_save_dist_restore_dist') with distribution_for_saving.scope(): model, output_name = model_and_input.get_model( From 6eb772aed296a82e31772637806e84ee5df6b8ee Mon Sep 17 00:00:00 2001 From: Yongfeng Gu Date: Thu, 25 Jul 2019 02:18:31 -0400 Subject: [PATCH 0545/3053] Revert "Opt out DEVICE_GPU_XLA_JIT and DEVICE_XLA_GPU from ResizeNearestNeighborOp, ResizeBilinearOp, and ResizeBilinearGradOp, because the dilation-based approach may introduce convolutions too large for GPU to handle." This reverts commit 4c5910487ebdd30f29e4ac4741f884e09d63d23e. 
--- .../tf2xla/kernels/image_resize_ops.cc | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc index 04a37a433b4..b309541a864 100644 --- a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc @@ -587,13 +587,7 @@ void ResizeNearestNeighborOp::Compile(XlaOpKernelContext* ctx) { GeneralCompile(ctx, align_corners_, is_kernel_bilinear_); } -REGISTER_XLA_OP(Name("ResizeNearestNeighbor") - .Device(DEVICE_CPU_XLA_JIT) - .CompileTimeConstantInput("size"), - ResizeNearestNeighborOp); -REGISTER_XLA_OP(Name("ResizeNearestNeighbor") - .Device(DEVICE_XLA_CPU) - .CompileTimeConstantInput("size"), +REGISTER_XLA_OP(Name("ResizeNearestNeighbor").CompileTimeConstantInput("size"), ResizeNearestNeighborOp); ResizeBilinearOp::ResizeBilinearOp(OpKernelConstruction* ctx) @@ -610,13 +604,7 @@ void ResizeBilinearOp::Compile(XlaOpKernelContext* ctx) { GeneralCompile(ctx, align_corners_, is_kernel_bilinear_); } -REGISTER_XLA_OP(Name("ResizeBilinear") - .Device(DEVICE_CPU_XLA_JIT) - .CompileTimeConstantInput("size"), - ResizeBilinearOp); -REGISTER_XLA_OP(Name("ResizeBilinear") - .Device(DEVICE_XLA_CPU) - .CompileTimeConstantInput("size"), +REGISTER_XLA_OP(Name("ResizeBilinear").CompileTimeConstantInput("size"), ResizeBilinearOp); ResizeBilinearGradOp::ResizeBilinearGradOp(OpKernelConstruction* ctx) @@ -710,7 +698,6 @@ void ResizeBilinearGradOp::Compile(XlaOpKernelContext* ctx) { ctx->SetOutput(0, output); } -REGISTER_XLA_OP(Name("ResizeBilinearGrad").Device(DEVICE_CPU_XLA_JIT), ResizeBilinearGradOp); -REGISTER_XLA_OP(Name("ResizeBilinearGrad").Device(DEVICE_XLA_CPU), ResizeBilinearGradOp); +REGISTER_XLA_OP(Name("ResizeBilinearGrad"), ResizeBilinearGradOp); } // namespace tensorflow From 559531b8dd39ef95fb003989a09bf29989923252 Mon Sep 17 00:00:00 2001 From: Yongfeng Gu Date: Thu, 25 Jul 2019 02:24:21 -0400 Subject: [PATCH 0546/3053] Add ResizeNearestNeighborOp, ResizeBilinearOp, and ResizeBilinearGradOp to the OpIsSlow list, because the dilation-based approach may introduce convolutions too large for GPU to handle for certain sizes. Disallowing slow Ops can opt them out of XLA. --- tensorflow/compiler/jit/compilability_check_util.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/jit/compilability_check_util.cc b/tensorflow/compiler/jit/compilability_check_util.cc index 5e3b93d30e5..aa526d8fabf 100644 --- a/tensorflow/compiler/jit/compilability_check_util.cc +++ b/tensorflow/compiler/jit/compilability_check_util.cc @@ -265,7 +265,10 @@ bool RecursiveCompilabilityChecker::OpIsSlow(const Node& node) const { // b/135640736: MatrixInverse performance issues. return node.type_string() == "SelfAdjointEigV2" || node.type_string() == "Svd" || node.type_string() == "Qr" || - node.type_string() == "MatrixInverse"; + node.type_string() == "MatrixInverse" || + node.type_string() == "ResizeNearestNeighbor" || + node.type_string() == "ResizeBilinear" || + node.type_string() == "ResizeBilinearGrad"; } bool RecursiveCompilabilityChecker::IsCompilableNode( From 24b3e6cf73f3bcfadac0e04a88f10fe5cb4556cc Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 25 Jul 2019 00:19:40 -0700 Subject: [PATCH 0547/3053] No public changes. 
PiperOrigin-RevId: 259894490 --- tensorflow/contrib/layers/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/contrib/layers/BUILD b/tensorflow/contrib/layers/BUILD index 8e410006c16..e3bc372910f 100644 --- a/tensorflow/contrib/layers/BUILD +++ b/tensorflow/contrib/layers/BUILD @@ -77,6 +77,7 @@ tf_custom_op_py_library( srcs_version = "PY2AND3", visibility = [ "//learning/brain:__subpackages__", + "//learning/lib/ami/simple_ml/link_other_ml_tools/tensorflow:__subpackages__", "//tensorflow:__subpackages__", "//tensorflow_model_optimization:__subpackages__", "//third_party/py/tf_slim:__subpackages__", From 87e03a53f3f6b0477502a5f501a3ceea1e8f43e0 Mon Sep 17 00:00:00 2001 From: Gunhan Gulsoy Date: Thu, 25 Jul 2019 00:46:27 -0700 Subject: [PATCH 0548/3053] Simplify the recursive tmeplate in TypedKernel PiperOrigin-RevId: 259897561 --- tensorflow/stream_executor/kernel.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tensorflow/stream_executor/kernel.h b/tensorflow/stream_executor/kernel.h index 9384db68582..1e4f375073e 100644 --- a/tensorflow/stream_executor/kernel.h +++ b/tensorflow/stream_executor/kernel.h @@ -525,16 +525,19 @@ class TypedKernel : public KernelBase { // structure. void PackParams(KernelArgsArray *args, Params &... params) const { - PackOneParam(args, params...); + PackOneParamFromList(args, params...); } template - void PackOneParam(KernelArgsArray *args, const T &arg, - const RestOfParams &... rest) const { + void PackOneParamFromList(KernelArgsArray *args, + const T &arg, const RestOfParams &... rest) const { PackOneParam(args, arg); - PackOneParam(args, rest...); + PackOneParamFromList(args, rest...); } + // Base case for variadic template expansion - nothing to do! + void PackOneParamFromList(KernelArgsArray *args) const {} + // Packs one (non-DeviceMemoryBase) parameter into the arg and sizes array. // The enable_if<> is for excluding DeviceMemoryBase args, which have a // separate implementation below. @@ -581,9 +584,6 @@ class TypedKernel : public KernelBase { args->add_shared_bytes(arg.size()); } - // Base case for variadic template expansion - nothing to do! - void PackOneParam(KernelArgsArray *args) const {} - SE_DISALLOW_COPY_AND_ASSIGN(TypedKernel); }; From 19c39157c0ac76545ae82bf48d2e11784ff232fb Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 25 Jul 2019 02:02:19 -0700 Subject: [PATCH 0549/3053] compat: Update forward compatibility horizon to 2019-07-25 PiperOrigin-RevId: 259906647 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 0c980024549..18a51e6d92e 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -31,7 +31,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2019, 7, 24) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2019, 7, 25) _FORWARD_COMPATIBILITY_HORIZON_OVERRIDDEN = False _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" From 02a4290cbd2a2234c7b65d5fc89c060096aaa74b Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Thu, 25 Jul 2019 02:02:20 -0700 Subject: [PATCH 0550/3053] Update GraphDef version to 107. PiperOrigin-RevId: 259906652 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 304eef492c6..a854d9056e1 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 106 // Updated: 2019/7/24 +#define TF_GRAPH_DEF_VERSION 107 // Updated: 2019/7/25 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From 5518980ae5afa1591c2e55bb4fefb7591910b2de Mon Sep 17 00:00:00 2001 From: Chao Mei Date: Thu, 25 Jul 2019 02:34:38 -0700 Subject: [PATCH 0551/3053] Create a helper lib for some utility functions in the benchmark tool. PiperOrigin-RevId: 259910553 --- tensorflow/lite/tools/benchmark/BUILD | 25 ++++++ .../lite/tools/benchmark/benchmark_model.cc | 17 +--- .../tools/benchmark/benchmark_tflite_model.cc | 33 +------- .../lite/tools/benchmark/benchmark_utils.cc | 37 +++++++++ .../lite/tools/benchmark/benchmark_utils.h | 52 ++++++++++++ .../tools/benchmark/benchmark_utils_test.cc | 80 +++++++++++++++++++ 6 files changed, 200 insertions(+), 44 deletions(-) create mode 100644 tensorflow/lite/tools/benchmark/benchmark_utils.cc create mode 100644 tensorflow/lite/tools/benchmark/benchmark_utils.h create mode 100644 tensorflow/lite/tools/benchmark/benchmark_utils_test.cc diff --git a/tensorflow/lite/tools/benchmark/BUILD b/tensorflow/lite/tools/benchmark/BUILD index 69e8fc6b2ce..461acf0735d 100644 --- a/tensorflow/lite/tools/benchmark/BUILD +++ b/tensorflow/lite/tools/benchmark/BUILD @@ -87,6 +87,7 @@ cc_library( copts = common_copts, deps = [ ":benchmark_model_lib", + ":benchmark_utils", ":logging", "//tensorflow/lite:framework", "//tensorflow/lite:string_util", @@ -117,6 +118,7 @@ cc_library( copts = common_copts, deps = [ ":benchmark_params", + ":benchmark_utils", ":logging", "//tensorflow/core:stats_calculator_portable", "//tensorflow/lite:framework", @@ -125,4 +127,27 @@ cc_library( ], ) +cc_library( + name = "benchmark_utils", + srcs = [ + "benchmark_utils.cc", + ], + hdrs = ["benchmark_utils.h"], + copts = common_copts, + deps = ["//tensorflow/lite/profiling:time"], +) + +cc_test( + name = "benchmark_utils_test", + srcs = [ + "benchmark_utils_test.cc", + ], + copts = common_copts, + deps = [ + ":benchmark_utils", + "//tensorflow/lite/profiling:time", + "@com_google_googletest//:gtest_main", + ], +) + tflite_portable_test_suite() diff --git a/tensorflow/lite/tools/benchmark/benchmark_model.cc b/tensorflow/lite/tools/benchmark/benchmark_model.cc index 3ee5500ef7a..488dc506dd3 100644 --- a/tensorflow/lite/tools/benchmark/benchmark_model.cc +++ b/tensorflow/lite/tools/benchmark/benchmark_model.cc @@ -19,22 +19,9 @@ limitations under the License. #include #include "tensorflow/lite/profiling/time.h" +#include "tensorflow/lite/tools/benchmark/benchmark_utils.h" #include "tensorflow/lite/tools/benchmark/logging.h" -namespace { -void SleepForSeconds(double sleep_seconds) { - if (sleep_seconds <= 0.0) { - return; - } - // If requested, sleep between runs for an arbitrary amount of time. - // This can be helpful to determine the effect of mobile processor - // scaling and thermal throttling. 
- return tflite::profiling::time::SleepForMicros( - static_cast(sleep_seconds * 1e6)); -} - -} // namespace - namespace tflite { namespace benchmark { using tensorflow::Stat; @@ -143,7 +130,7 @@ Stat BenchmarkModel::Run(int min_num_times, float min_secs, listeners_.OnSingleRunEnd(); run_stats.UpdateStat(end_us - start_us); - SleepForSeconds(params_.Get("run_delay")); + util::SleepForSeconds(params_.Get("run_delay")); now_us = profiling::time::NowMicros(); } diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc index 0035a0b4373..b58e529c78a 100644 --- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc +++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc @@ -29,6 +29,7 @@ limitations under the License. #include "tensorflow/lite/profiling/buffered_profiler.h" #include "tensorflow/lite/profiling/profile_summarizer.h" #include "tensorflow/lite/string_util.h" +#include "tensorflow/lite/tools/benchmark/benchmark_utils.h" #include "tensorflow/lite/tools/benchmark/logging.h" #include "tensorflow/lite/tools/evaluation/utils.h" @@ -119,39 +120,13 @@ void GemmlowpProfilingListener::OnBenchmarkEnd( } std::vector Split(const std::string& str, const char delim) { - std::istringstream input(str); std::vector results; - std::string item; - while (std::getline(input, item, delim)) { - results.push_back(item); + if (!util::SplitAndParse(str, delim, &results)) { + results.clear(); } return results; } -template -bool SplitAndParse(const std::string& str, char delim, std::vector* values) { - std::istringstream input(str); - bool first = true; - while (!input.eof()) { - if (!first) { - char c; - input >> c; - if (c != delim) { - return false; - } - } else { - first = false; - } - T val; - input >> val; - if (!input.eof() && !input.good()) { - return false; - } - values->push_back(val); - } - return true; -} - template void FillRandomValue(T* ptr, int num_elements, const std::function& random_func) { @@ -197,7 +172,7 @@ bool PopulateInputLayerInfo( input.name = names[i]; - TFLITE_BENCHMARK_CHECK(SplitAndParse(shapes[i], ',', &input.shape)) + TFLITE_BENCHMARK_CHECK(util::SplitAndParse(shapes[i], ',', &input.shape)) << "Incorrect size string specified: " << shapes[i]; for (int dim : input.shape) { if (dim == -1) { diff --git a/tensorflow/lite/tools/benchmark/benchmark_utils.cc b/tensorflow/lite/tools/benchmark/benchmark_utils.cc new file mode 100644 index 00000000000..d8fe2633307 --- /dev/null +++ b/tensorflow/lite/tools/benchmark/benchmark_utils.cc @@ -0,0 +1,37 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/lite/tools/benchmark/benchmark_utils.h" + +#include "tensorflow/lite/profiling/time.h" + +namespace tflite { +namespace benchmark { +namespace util { + +void SleepForSeconds(double sleep_seconds) { + if (sleep_seconds <= 0.0) { + return; + } + // If requested, sleep between runs for an arbitrary amount of time. + // This can be helpful to determine the effect of mobile processor + // scaling and thermal throttling. + tflite::profiling::time::SleepForMicros( + static_cast(sleep_seconds * 1e6)); +} + +} // namespace util +} // namespace benchmark +} // namespace tflite diff --git a/tensorflow/lite/tools/benchmark/benchmark_utils.h b/tensorflow/lite/tools/benchmark/benchmark_utils.h new file mode 100644 index 00000000000..b69011626d0 --- /dev/null +++ b/tensorflow/lite/tools/benchmark/benchmark_utils.h @@ -0,0 +1,52 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_TOOLS_BENCHMARK_BENCHMARK_UTILS_H_ +#define TENSORFLOW_LITE_TOOLS_BENCHMARK_BENCHMARK_UTILS_H_ + +#include +#include +#include + +namespace tflite { +namespace benchmark { +namespace util { + +// A convenient function that wraps tflite::profiling::time::SleepForMicros and +// simply return if 'sleep_seconds' is negative. +void SleepForSeconds(double sleep_seconds); + +// Split the 'str' according to 'delim', and store each splitted element into +// 'values'. +template +bool SplitAndParse(const std::string& str, char delim, std::vector* values) { + std::istringstream input(str); + for (std::string line; std::getline(input, line, delim);) { + std::istringstream to_parse(line); + T val; + to_parse >> val; + if (!to_parse.eof() && !to_parse.good()) { + return false; + } + values->emplace_back(val); + } + return true; +} + +} // namespace util +} // namespace benchmark +} // namespace tflite + +#endif // TENSORFLOW_LITE_TOOLS_BENCHMARK_BENCHMARK_UTILS_H_ diff --git a/tensorflow/lite/tools/benchmark/benchmark_utils_test.cc b/tensorflow/lite/tools/benchmark/benchmark_utils_test.cc new file mode 100644 index 00000000000..cb1517293f7 --- /dev/null +++ b/tensorflow/lite/tools/benchmark/benchmark_utils_test.cc @@ -0,0 +1,80 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/lite/tools/benchmark/benchmark_utils.h" + +#include +#include + +#include +#include +#include "tensorflow/lite/profiling/time.h" + +namespace tflite { +namespace benchmark { +namespace { + +TEST(BenchmarkHelpersTest, SleepForNegativeSeconds) { + const auto start_ts = tflite::profiling::time::NowMicros(); + // The following should return immediately. + util::SleepForSeconds(-5.0); + const auto end_ts = tflite::profiling::time::NowMicros(); + + // As we don't have a mocked clock, we simply expect <1 sec has elapsed, which + // is admittedly not quite accurate. + EXPECT_LT(end_ts - start_ts, 1000000); +} + +TEST(BenchmarkHelpersTest, SleepForSomeSeconds) { + const auto start_ts = tflite::profiling::time::NowMicros(); + // The following should return after 2.0 secs + util::SleepForSeconds(2.0); + const auto end_ts = tflite::profiling::time::NowMicros(); + + // As we don't have a mocked clock, we simply expect >1.9 sec has elapsed. + EXPECT_GT(end_ts - start_ts, 1900000); +} + +TEST(BenchmarkHelpersTest, SplitAndParseFailed) { + std::vector results; + const bool splitted = util::SplitAndParse("hello;world", ';', &results); + + EXPECT_FALSE(splitted); +} + +TEST(BenchmarkHelpersTest, SplitAndParseString) { + std::vector results; + const bool splitted = util::SplitAndParse("hello,world", ',', &results); + + EXPECT_TRUE(splitted); + EXPECT_EQ(2, results.size()); + + EXPECT_EQ("hello", results[0]); + EXPECT_EQ("world", results[1]); +} + +TEST(BenchmarkHelpersTest, SplitAndParseInts) { + std::vector results; + const bool splitted = util::SplitAndParse("1,2", ',', &results); + + EXPECT_TRUE(splitted); + EXPECT_EQ(2, results.size()); + + EXPECT_EQ(1, results[0]); + EXPECT_EQ(2, results[1]); +} + +} // namespace +} // namespace benchmark +} // namespace tflite From f3a798279463d6a00116ac4b332c570fe54377f4 Mon Sep 17 00:00:00 2001 From: Tom Hennigan Date: Thu, 25 Jul 2019 03:06:19 -0700 Subject: [PATCH 0552/3053] Automated rollback of commit d2ecf4da67316061a312c1e60305d15b6133be65. Revert #29987. PiperOrigin-RevId: 259914773 --- tensorflow/python/kernel_tests/init_ops_test.py | 7 ------- tensorflow/python/ops/math_ops.py | 17 +++-------------- 2 files changed, 3 insertions(+), 21 deletions(-) diff --git a/tensorflow/python/kernel_tests/init_ops_test.py b/tensorflow/python/kernel_tests/init_ops_test.py index 1d935ee8123..4b9681afd2c 100644 --- a/tensorflow/python/kernel_tests/init_ops_test.py +++ b/tensorflow/python/kernel_tests/init_ops_test.py @@ -537,13 +537,6 @@ class RangeTest(test.TestCase): math_ops.range( 0, 0, 1, dtype=dtypes.float64).dtype, dtypes.float64) - def testMixedDType(self): - # Test case for GitHub issue 29867 - with self.cached_session(use_gpu=True): - tf_ans = math_ops.range(constant_op.constant(5), dtype=dtypes.float32) - self.assertAllEqual( - self.evaluate(tf_ans), np.arange(np.int32(5), dtype=np.float32)) - # TODO(vrv): move to sequence_ops_test? class LinSpaceTest(test.TestCase): diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index eb9d4407641..9becce79cb1 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -1349,20 +1349,9 @@ def range(start, limit=None, delta=1, dtype=None, name="range"): # pylint: disa start, limit = 0, start with ops.name_scope(name, "Range", [start, limit, delta]) as name: - # In case dtype is not none, cast start, limit, and delta directly. 
- # Otherwise pass to convert_to_tensor. This is to handle - # the situation with: - # tf.range(tf.constant(5), dtype=tf.float32) - # which is comparable with: - # np.arange(np.int(5), dtype=np.float32) - if dtype is not None: - start = cast(start, dtype=dtype, name="start") - limit = cast(limit, dtype=dtype, name="limit") - delta = cast(delta, dtype=dtype, name="delta") - else: - start = ops.convert_to_tensor(start, name="start") - limit = ops.convert_to_tensor(limit, name="limit") - delta = ops.convert_to_tensor(delta, name="delta") + start = ops.convert_to_tensor(start, dtype=dtype, name="start") + limit = ops.convert_to_tensor(limit, dtype=dtype, name="limit") + delta = ops.convert_to_tensor(delta, dtype=dtype, name="delta") # infer dtype if not explicitly provided if dtype is None: From 1ffdcbe96ae75645cccbe41cfe711e7e81f1e060 Mon Sep 17 00:00:00 2001 From: Stefano Galarraga Date: Thu, 25 Jul 2019 03:07:52 -0700 Subject: [PATCH 0553/3053] Add delegate support for QUANTIZED_16BIT_LSTM PiperOrigin-RevId: 259914993 --- tensorflow/lite/delegates/nnapi/BUILD | 20 + .../lite/delegates/nnapi/nnapi_delegate.cc | 398 +++++++++++++++++- .../lite/delegates/nnapi/quant_lstm_sup.cc | 153 +++++++ .../lite/delegates/nnapi/quant_lstm_sup.h | 58 +++ .../delegates/nnapi/quant_lstm_sup_test.cc | 344 +++++++++++++++ tensorflow/lite/kernels/BUILD | 15 + tensorflow/lite/kernels/kernel_util.h | 10 +- .../lite/kernels/quant_basic_lstm_test.cc | 230 ++++++++++ tensorflow/lite/nnapi/NeuralNetworksTypes.h | 2 + tensorflow/lite/tools/make/Makefile | 1 + 10 files changed, 1214 insertions(+), 17 deletions(-) create mode 100644 tensorflow/lite/delegates/nnapi/quant_lstm_sup.cc create mode 100644 tensorflow/lite/delegates/nnapi/quant_lstm_sup.h create mode 100644 tensorflow/lite/delegates/nnapi/quant_lstm_sup_test.cc create mode 100644 tensorflow/lite/kernels/quant_basic_lstm_test.cc diff --git a/tensorflow/lite/delegates/nnapi/BUILD b/tensorflow/lite/delegates/nnapi/BUILD index 7cd5d146a13..f8439da7087 100644 --- a/tensorflow/lite/delegates/nnapi/BUILD +++ b/tensorflow/lite/delegates/nnapi/BUILD @@ -18,6 +18,8 @@ cc_library( ], "//conditions:default": [ "nnapi_delegate.cc", + "quant_lstm_sup.h", + "quant_lstm_sup.cc", ], }), hdrs = ["nnapi_delegate.h"], @@ -51,4 +53,22 @@ cc_test( ], ) +cc_test( + name = "quant_lstm_sup_test", + size = "small", + srcs = [ + "quant_lstm_sup.cc", + "quant_lstm_sup.h", + "quant_lstm_sup_test.cc", + ], + deps = [ + ":nnapi_delegate", + "//tensorflow/lite:framework", + "//tensorflow/lite/c:c_api_internal", + "//tensorflow/lite/kernels:kernel_util", + "//tensorflow/lite/testing:util", + "@com_google_googletest//:gtest", + ], +) + tflite_portable_test_suite() diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc index 87c89dde4fc..2e965b08652 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc @@ -19,10 +19,12 @@ limitations under the License. #include #include #include +#include #include #include #include #include +#include #include #include "tensorflow/lite/allocation.h" @@ -31,6 +33,7 @@ limitations under the License. 
#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/c_api_internal.h" #include "tensorflow/lite/context_util.h" +#include "tensorflow/lite/delegates/nnapi/quant_lstm_sup.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/minimal_logging.h" #include "tensorflow/lite/nnapi/nnapi_implementation.h" @@ -154,6 +157,22 @@ bool NeedInt8Conversion(const TfLiteContext* context, int builtin_code, } } +constexpr int kLstmFullKernelInputSize = 24; +// The 20 input version is deprecated and kept only to +// support old model. The latest version of the LSTM Full Kernel +// is the one with 24 inputs +constexpr int kLstmFullKernelNoOptionalParamsInputSize = 20; +constexpr int kLstmBasicKernelInputSize = 5; + +inline bool isLstmBasicKernel(const TfLiteNode* node) { + return node->inputs->size == kLstmBasicKernelInputSize; +} + +inline bool isLstmFullKernel(const TfLiteNode* node) { + return node->inputs->size == kLstmFullKernelInputSize || + node->inputs->size == kLstmFullKernelNoOptionalParamsInputSize; +} + bool IsHybridOperator(const TfLiteContext* context, int builtin_code, const TfLiteNode* node) { switch (builtin_code) { @@ -165,7 +184,15 @@ bool IsHybridOperator(const TfLiteContext* context, int builtin_code, const TfLiteType filter_type = context->tensors[filter_id].type; return IsFloat(input_type) && IsQuantized(filter_type); } - case kTfLiteBuiltinLstm: + case kTfLiteBuiltinLstm: { + const int input_id = node->inputs->data[0]; + // Input #1 is optional so use #2 to determine if hybrid. + const int weights_id = node->inputs->data[2]; + const TfLiteType input_type = context->tensors[input_id].type; + const TfLiteType weights_type = context->tensors[weights_id].type; + return isLstmFullKernel(node) && IsFloat(input_type) && + IsQuantized(weights_type); + } case kTfLiteBuiltinUnidirectionalSequenceLstm: { const int input_id = node->inputs->data[0]; // Input #1 is optional so use #2 to determine if hybrid. @@ -356,6 +383,13 @@ class OperandMapping { // be mapped. int add_new_non_tensor_operand() { return next_ann_tensor_index_++; } + // This call is necessary for input operands generated by the delegate + // to map constant inputs not present in TFLite but required by NNAPI, + // for example when splitting one input in several ones. + int add_delegate_generated_input_ann_tensors_operand() { + return next_ann_tensor_index_++; + } + // Add a new mapping from `tflite_index` and return the NN API tensor index. 
int add_new_ann_tensor_index(int tflite_index) { if (tflite_index >= lite_tensor_to_ann_tensor_.size()) { @@ -581,6 +615,66 @@ class NNAPIOpBuilder { return kTfLiteOk; } + template + TfLiteStatus AddNewInputConstantTensor( + int32_t nn_type, TfLiteType type, const TfLiteIntArray* dims, + const std::function& init_fn, + const TfLiteQuantizationParams& quant_params, int* tensor_index) { + TF_LITE_ENSURE_OK(context_, + context_->AddTensors(context_, 1, tensor_index)); + + TfLiteTensor* new_tensor = &context_->tensors[*tensor_index]; + new_tensor->type = type; + new_tensor->allocation_type = kTfLiteDynamic; + new_tensor->params = quant_params; + + // Not removing the new tensor in case of resizing errors since it will + // be cleared by the context + TF_LITE_ENSURE_OK( + context_, + context_->ResizeTensor( + context_, new_tensor, + // Resize Tensor takes ownership of the dims array passed as param + TfLiteIntArrayCopy(dims))); + + const int64_t out_size = NumElements(dims); + TF_LITE_ENSURE_OK(context_, init_fn(new_tensor->data, out_size)); + + const uint32_t tensor_rank = static_cast(dims->size); + const uint32_t* tensor_dims = reinterpret_cast(dims->data); + ANeuralNetworksOperandType operand_type{nn_type, tensor_rank, tensor_dims, + quant_params.scale, + quant_params.zero_point}; + + const int ann_tensor_index = + operand_mapping_->add_delegate_generated_input_ann_tensors_operand(); + + RETURN_TFLITE_ERROR_IF_NN_ERROR( + context_, + nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type)); + + augmented_inputs_.push_back(ann_tensor_index); + + RETURN_TFLITE_ERROR_IF_NN_ERROR( + context_, nnapi_->ANeuralNetworksModel_setOperandValue( + nn_model_, ann_tensor_index, new_tensor->data.raw, + new_tensor->bytes)); + + return kTfLiteOk; + } + + template + TfLiteStatus AddNewInputConstantTensor( + int32_t nn_type, TfLiteType type, std::initializer_list dims, + const std::function& init_fn, + const TfLiteQuantizationParams& quant_params, int* tensor_index) { + TfLiteIntArray* dim_array = TfLiteIntArrayCreate(dims.size()); + const auto result = AddNewInputConstantTensor( + nn_type, type, dim_array, init_fn, quant_params, tensor_index); + TfLiteIntArrayFree(dim_array); + return result; + } + private: // Returns a TF Lite type which has the same memory representation as a // provided NN API type. @@ -716,6 +810,11 @@ class NNAPIOpBuilder { case kTfLiteBool: nn_type = ANEURALNETWORKS_TENSOR_BOOL8; break; + case kTfLiteInt16: + nn_type = ANEURALNETWORKS_TENSOR_QUANT16_SYMM; + scale = tensor->params.scale; + zeroPoint = tensor->params.zero_point; + break; default: context_->ReportError( context_, "Failed to add NN API tensor: type %s is not supported.", @@ -839,6 +938,7 @@ struct NNAPIOpMappingArgs { TfLiteNode* node; std::vector* model_state_outputs; std::vector* model_state_tfl_inputs; + std::vector>* feedback_loops; }; // Mapping function simply returning the operation type without adding any @@ -1665,20 +1765,246 @@ class NNAPIDelegateKernel { // Hybrid operators not supported before NNAPI 1.2. return nullptr; } - // TODO(levp): name the constants for number of inputs in LSTM kernel. - if (node->inputs->size != 20 && node->inputs->size != 24) { - return nullptr; + + const auto weight_input_index = + isLstmBasicKernel(node) + ? 
2 /* basic::kInputWeights */ + : 4 /* full::kInputToOutputWeightsTensor */; + + const TfLiteType weight_type = + context->tensors[node->inputs->data[weight_input_index]].type; + + if (isLstmBasicKernel(node)) { + if (weight_type != kTfLiteUInt8) { + return nullptr; + } + const auto input_quantization_params = + context->tensors[node->inputs->data[0]].params; + if (input_quantization_params.scale != 1. / 128. || + input_quantization_params.zero_point != 128) { + return nullptr; + } + + const auto output_quantization_params = + context->tensors[node->outputs->data[0]].params; + if (output_quantization_params.scale != 1. / 128. || + output_quantization_params.zero_point != 128) { + return nullptr; + } + + const auto cell_state_quantization_params = + context->tensors[node->outputs->data[1]].params; + if (cell_state_quantization_params.scale != 16. / 32768. || + cell_state_quantization_params.zero_point != 0) { + return nullptr; + } + + auto is_const_tensor = [&node, &context](int tensor_idx) { + return context->tensors[node->inputs->data[tensor_idx]] + .allocation_type == kTfLiteMmapRo; + }; + + if (!is_const_tensor(2 /* kInputWeights */)) { + return nullptr; + } + + if (!is_const_tensor(3 /* kInputBiases */)) { + return nullptr; + } + + return [](const NNAPIOpMappingArgs& mapping_args) + -> ANeuralNetworksOperationType { + const auto output_dims = + mapping_args.context + ->tensors[mapping_args.node->outputs->data[1]] + .dims; + + // Inputs kInputData + mapping_args.builder->AddTensorInput( + mapping_args.node->inputs->data[0 /* kInputData */], + /* hybrid_op */ false, + /* scalar_as_tensor */ false); + + // The 8 weights tensors are set decomposing the + // kInputWeights param + const auto weight_tensor = + mapping_args.context->tensors + [mapping_args.node->inputs->data[2 /* kInputWeights */]]; + + std::vector recurrent_to_input; + std::vector input_to_input; + std::vector recurrent_to_cell; + std::vector input_to_cell; + std::vector recurrent_to_forget; + std::vector input_to_forget; + std::vector recurrent_to_output; + std::vector input_to_output; + tflite::delegate::nnapi::DecomposeQuantLstmWeightsTensor( + weight_tensor.data.uint8, weight_tensor.dims, + &recurrent_to_input, &input_to_input, &recurrent_to_cell, + &input_to_cell, &recurrent_to_forget, &input_to_forget, + &recurrent_to_output, &input_to_output); + + const auto ui8_fill_with = + [](const std::vector& read_from, + TfLitePtrUnion write_to, int64_t size) -> TfLiteStatus { + std::copy(read_from.begin(), read_from.end(), write_to.uint8); + return kTfLiteOk; + }; + + TfLiteIntArray* recurrent_weight_dims = TfLiteIntArrayCreate(2); + TfLiteIntArray* input_weight_dims = TfLiteIntArrayCreate(2); + tflite::delegate::nnapi::SetWeightSubmatrixDims( + weight_tensor.dims, recurrent_weight_dims, input_weight_dims); + + int new_tensor_index = -1; + + mapping_args.builder->AddNewInputConstantTensor( + ANEURALNETWORKS_TENSOR_QUANT8_ASYMM, kTfLiteUInt8, + input_weight_dims, + std::bind(ui8_fill_with, input_to_input, + std::placeholders::_1, std::placeholders::_2), + weight_tensor.params, &new_tensor_index); + + mapping_args.builder->AddNewInputConstantTensor( + ANEURALNETWORKS_TENSOR_QUANT8_ASYMM, kTfLiteUInt8, + input_weight_dims, + std::bind(ui8_fill_with, input_to_forget, + std::placeholders::_1, std::placeholders::_2), + weight_tensor.params, &new_tensor_index); + + mapping_args.builder->AddNewInputConstantTensor( + ANEURALNETWORKS_TENSOR_QUANT8_ASYMM, kTfLiteUInt8, + input_weight_dims, + std::bind(ui8_fill_with, input_to_cell, 
std::placeholders::_1, + std::placeholders::_2), + weight_tensor.params, &new_tensor_index); + + mapping_args.builder->AddNewInputConstantTensor( + ANEURALNETWORKS_TENSOR_QUANT8_ASYMM, kTfLiteUInt8, + input_weight_dims, + std::bind(ui8_fill_with, input_to_output, + std::placeholders::_1, std::placeholders::_2), + weight_tensor.params, &new_tensor_index); + + mapping_args.builder->AddNewInputConstantTensor( + ANEURALNETWORKS_TENSOR_QUANT8_ASYMM, kTfLiteUInt8, + recurrent_weight_dims, + std::bind(ui8_fill_with, recurrent_to_input, + std::placeholders::_1, std::placeholders::_2), + weight_tensor.params, &new_tensor_index); + + mapping_args.builder->AddNewInputConstantTensor( + ANEURALNETWORKS_TENSOR_QUANT8_ASYMM, kTfLiteUInt8, + recurrent_weight_dims, + std::bind(ui8_fill_with, recurrent_to_forget, + std::placeholders::_1, std::placeholders::_2), + weight_tensor.params, &new_tensor_index); + + mapping_args.builder->AddNewInputConstantTensor( + ANEURALNETWORKS_TENSOR_QUANT8_ASYMM, kTfLiteUInt8, + recurrent_weight_dims, + std::bind(ui8_fill_with, recurrent_to_cell, + std::placeholders::_1, std::placeholders::_2), + weight_tensor.params, &new_tensor_index); + + mapping_args.builder->AddNewInputConstantTensor( + ANEURALNETWORKS_TENSOR_QUANT8_ASYMM, kTfLiteUInt8, + recurrent_weight_dims, + std::bind(ui8_fill_with, recurrent_to_output, + std::placeholders::_1, std::placeholders::_2), + weight_tensor.params, &new_tensor_index); + + TfLiteIntArrayFree(input_weight_dims); + TfLiteIntArrayFree(recurrent_weight_dims); + + // Biases have to be split in four + const auto i32_fill_with = + [](const std::vector& read_from, + TfLitePtrUnion write_to, int64_t size) -> TfLiteStatus { + std::copy(read_from.begin(), read_from.end(), write_to.i32); + return kTfLiteOk; + }; + + const auto bias_size = output_dims->data[1]; + const TfLiteTensor& biases_tensor = + mapping_args.context->tensors + [mapping_args.node->inputs->data[3 /* kInputBiases */]]; + + std::vector input_bias; + std::vector cell_bias; + std::vector forget_bias; + std::vector output_bias; + delegate::nnapi::DecomposeBiasTensor( + biases_tensor.data.i32, bias_size, &input_bias, &cell_bias, + &forget_bias, &output_bias); + + int input_bias_tensor = -1; + mapping_args.builder->AddNewInputConstantTensor( + ANEURALNETWORKS_TENSOR_INT32, kTfLiteInt32, {bias_size}, + std::bind(i32_fill_with, input_bias, std::placeholders::_1, + std::placeholders::_2), + biases_tensor.params, &input_bias_tensor); + // kForgetGateBiasTensor + int forget_bias_tensor = -1; + mapping_args.builder->AddNewInputConstantTensor( + ANEURALNETWORKS_TENSOR_INT32, kTfLiteInt32, {bias_size}, + std::bind(i32_fill_with, forget_bias, std::placeholders::_1, + std::placeholders::_2), + biases_tensor.params, &forget_bias_tensor); + // kCellGateBiasTensor + int cell_gate_bias_tensor = -1; + mapping_args.builder->AddNewInputConstantTensor( + ANEURALNETWORKS_TENSOR_INT32, kTfLiteInt32, {bias_size}, + std::bind(i32_fill_with, cell_bias, std::placeholders::_1, + std::placeholders::_2), + biases_tensor.params, &cell_gate_bias_tensor); + // kOutputGateBiasTensor + int output_gate_bias_tensor = -1; + mapping_args.builder->AddNewInputConstantTensor( + ANEURALNETWORKS_TENSOR_INT32, kTfLiteInt32, {bias_size}, + std::bind(i32_fill_with, output_bias, std::placeholders::_1, + std::placeholders::_2), + biases_tensor.params, &output_gate_bias_tensor); + + mapping_args.builder->AddTensorInput( + mapping_args.node->inputs->data[4 /* kInputPrevState */], + /* hybrid_op */ false, + /* scalar_as_tensor */ false); 
+ + // kInputPrevActivation + mapping_args.builder->AddTensorInput( + mapping_args.node->inputs->data[1 /* kInputPrevActivation */], + /* hybrid_op */ false, + /* scalar_as_tensor */ false); + + // Configuring the copy from the activation, state outputs + // to their associated inputs + mapping_args.feedback_loops->push_back(std::make_tuple( + 0 /*kOutputActivation*/, 1 /*kInputPrevActivation*/)); + + mapping_args.feedback_loops->push_back( + std::make_tuple(1 /*kOutputState*/, 4 /*kInputPrevState*/)); + + // OUTPUTS + // Setting only the first two since the remaining ones are + // ignored by NNAPI + mapping_args.builder->AddTensorOutput( + mapping_args.node->outputs->data[1 /* kOutputState */], 0); + + mapping_args.builder->AddTensorOutput( + mapping_args.node->outputs + ->data[0 /* kOutputkOutputActivationState */], + 0); + + return ANEURALNETWORKS_QUANTIZED_16BIT_LSTM; + }; } if (node->inputs->size == 24 && android_sdk_version < kMinSdkVersionForNNAPI12) { // LSTM with layer norm introduced in API level 29 return nullptr; } - const TfLiteType weight_type = - context - ->tensors[node->inputs - ->data[/*kInputToOutputWeightsTensor*/ 4]] - .type; if (weight_type != kTfLiteFloat32 && weight_type != kTfLiteUInt8) { return nullptr; } @@ -2358,6 +2684,11 @@ class NNAPIDelegateKernel { int relative_output_index = 0; size_t output_offset = 0; for (auto output_index : TfLiteIntArrayView(node->outputs)) { + // If the NNAPI implementation doesn't have some of the outputs + // they are left unmapped and we should not try to read their value here + if (operand_mapping_.lite_index_to_ann(output_index) == -1) { + continue; + } TfLiteTensor* tensor = &context->tensors[output_index]; if (tensor->buffer_handle != kTfLiteNullBufferHandle && tensor->buffer_handle < tensor_memory_map_->size()) { @@ -2432,6 +2763,20 @@ class NNAPIDelegateKernel { output_offset += getNumPaddingBytes(tensor->bytes); } + // copy output of all output tensors in feedback_loops_ into the + // associated input + for (auto feedback_loop : feedback_loops_) { + int output_tensor_idx; + int input_tensor_idx; + std::tie(output_tensor_idx, input_tensor_idx) = feedback_loop; + TfLiteTensor* src = + &context->tensors[node->outputs->data[output_tensor_idx]]; + TfLiteTensor* dest = + &context->tensors[node->inputs->data[input_tensor_idx]]; + + memcpy(dest->data.raw, src->data.raw, src->bytes); + } + return kTfLiteOk; } @@ -2456,6 +2801,10 @@ class NNAPIDelegateKernel { tensor_memory_map_; std::vector model_state_outputs_; std::vector model_state_tfl_inputs_; + // This is the equivalent of the pair model_state_outputs_, + // model_state_tfl_inputs_ for all tensors where we have to keep the output + // data available for TFLite model users + std::vector> feedback_loops_; std::unique_ptr nn_input_memory_; std::unique_ptr nn_output_memory_; @@ -2552,13 +2901,19 @@ class NNAPIDelegateKernel { input_tensor_flags | NN_TENSOR_FLAG_INT8_CONVERSION)); continue; } - if (reg->builtin_code == kTfLiteBuiltinLstm && input_pos >= 20) { + if (reg->builtin_code == kTfLiteBuiltinLstm && isLstmFullKernel(node) && + input_pos >= 20) { // Skip layer normalization weights. They are added in the Map // function (after all the other inputs added there) since layer // normalization weights are the last four inputs of the LSTM op in // NNAPI. 
continue; } + if (reg->builtin_code == kTfLiteBuiltinLstm && + isLstmBasicKernel(node)) { + // Configuring all inputs in the Map function + continue; + } if (reg->builtin_code == kTfLiteBuiltinUnidirectionalSequenceLstm) { if (input_pos >= 20) { // Skip layer normalization weights. They are added in the Map @@ -2694,13 +3049,21 @@ class NNAPIDelegateKernel { int nn_op_type = Map( context, reg->builtin_code, reg->version, nnapi_->android_sdk_version, node)({context, &builder, node, &model_state_outputs_, - &model_state_tfl_inputs_}); + &model_state_tfl_inputs_, &feedback_loops_}); // Map outputs to NN API tensor indices. int output_tensor_flags = 0; if (need_int8_conversion) { output_tensor_flags |= NN_TENSOR_FLAG_INT8_CONVERSION; } - for (auto output_index : TfLiteIntArrayView(node->outputs)) { + for (int output_pos = 0; output_pos < node->outputs->size; ++output_pos) { + const auto output_index = node->outputs->data[output_pos]; + + // Outputs for basic LSTM cell are set in the Map function since + if (reg->builtin_code == kTfLiteBuiltinLstm && + isLstmBasicKernel(node)) { + continue; + } + TF_LITE_ENSURE_STATUS( builder.AddTensorOutput(output_index, output_tensor_flags)); } @@ -2731,7 +3094,10 @@ class NNAPIDelegateKernel { for (int i : TfLiteIntArrayView(input_tensors)) { // Constant tensors are not NNAPI inputs. if (i != kOptionalTensor && - context->tensors[i].allocation_type != kTfLiteMmapRo) { + context->tensors[i].allocation_type != kTfLiteMmapRo && + // The delegate might not have mapped this input (this can + // happen if one tensor is split in several ones) + operand_mapping_.lite_index_to_ann(i) != -1) { inputs.push_back(operand_mapping_.lite_index_to_ann(i)); if (context->tensors[i].buffer_handle != kTfLiteNullBufferHandle) { continue; @@ -2754,7 +3120,11 @@ class NNAPIDelegateKernel { size_t total_output_byte_size = 0; for (int i : TfLiteIntArrayView(output_tensors)) { - outputs.push_back(operand_mapping_.lite_index_to_ann(i)); + const int output_tensor_ann_index = operand_mapping_.lite_index_to_ann(i); + // Unmapped outputs are not added + if (output_tensor_ann_index != -1) { + outputs.push_back(output_tensor_ann_index); + } if (context->tensors[i].buffer_handle != kTfLiteNullBufferHandle) { continue; } diff --git a/tensorflow/lite/delegates/nnapi/quant_lstm_sup.cc b/tensorflow/lite/delegates/nnapi/quant_lstm_sup.cc new file mode 100644 index 00000000000..c79c404c360 --- /dev/null +++ b/tensorflow/lite/delegates/nnapi/quant_lstm_sup.cc @@ -0,0 +1,153 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/lite/delegates/nnapi/quant_lstm_sup.h" + +#include + +#include "tensorflow/lite/context_util.h" +#include "tensorflow/lite/kernels/kernel_util.h" + +namespace tflite { +namespace delegate { +namespace nnapi { + +// The function extracts a submatrix of the weights at a given row +// and column offsets from a 2D matrix +void ExtractQuantLstmWeightsSubmatrix(const TfLiteIntArray* submatrix_dims, + const int32_t offset_row, + const int32_t offset_column, + const TfLiteIntArray* weight_dims, + const uint8_t* weights, + std::vector* submatrix) { + auto const& submatrix_rows = submatrix_dims->data[0]; + auto const& submatrix_cols = submatrix_dims->data[1]; + auto const& weight_cols = weight_dims->data[1]; + + submatrix->resize(NumElements(submatrix_dims)); + + for (uint32_t i = 0; i < submatrix_rows * submatrix_cols; ++i) { + const uint32_t row = i / submatrix_cols; + const uint32_t column = i % submatrix_cols; + (*submatrix)[i] = + weights[(row + offset_row) * weight_cols + column + offset_column]; + } +} + +inline int OutputDepth(const TfLiteIntArray* weight_dims) { + return weight_dims->data[0] / 4; +} + +inline int InputDepth(const TfLiteIntArray* weight_dims) { + return weight_dims->data[1] - OutputDepth(weight_dims); +} + +void SetWeightSubmatrixDims(const TfLiteIntArray* weight_dims, + TfLiteIntArray* recurrent_submatrix_dims, + TfLiteIntArray* input_submatrix_dims) { + const auto input_depth = InputDepth(weight_dims); + const auto output_depth = OutputDepth(weight_dims); + + recurrent_submatrix_dims->data[0] = output_depth; + recurrent_submatrix_dims->data[1] = output_depth; + + input_submatrix_dims->data[0] = output_depth; + input_submatrix_dims->data[1] = input_depth; +} + +// Doing exactly the opposite work of QuantizedLSTMCell::concatenateWeights +// in NNAPI, decomposing the concat_weights tensor data into its 8 components +// according to the following diagram +// +// +-----------------------------------+ +// | recurrentToInput | inputToInput | +// |-------------------+---------------| +// | recurrentToCell | inputToCell | +// |-------------------+---------------| +// | recurrentToForget | inputToForget | +// |-------------------+---------------| +// | recurrentToOutput | inputToOutput | +// +-----------------------------------+ +void DecomposeQuantLstmWeightsTensor(const uint8_t* concat_weights, + const TfLiteIntArray* weight_dims, + std::vector* recurrent_to_input, + std::vector* input_to_input, + std::vector* recurrent_to_cell, + std::vector* input_to_cell, + std::vector* recurrent_to_forget, + std::vector* input_to_forget, + std::vector* recurrent_to_output, + std::vector* input_to_output) { + const auto output_depth = OutputDepth(weight_dims); + + TfLiteIntArray* recurrent_submatrix_dims = TfLiteIntArrayCreate(2); + TfLiteIntArray* input_submatrix_dims = TfLiteIntArrayCreate(2); + SetWeightSubmatrixDims(weight_dims, recurrent_submatrix_dims, + input_submatrix_dims); + + ExtractQuantLstmWeightsSubmatrix(recurrent_submatrix_dims, 0 * output_depth, + 0, weight_dims, concat_weights, + recurrent_to_input); + ExtractQuantLstmWeightsSubmatrix(input_submatrix_dims, 0 * output_depth, + output_depth, weight_dims, concat_weights, + input_to_input); + + ExtractQuantLstmWeightsSubmatrix(recurrent_submatrix_dims, 1 * output_depth, + 0, weight_dims, concat_weights, + recurrent_to_cell); + ExtractQuantLstmWeightsSubmatrix(input_submatrix_dims, 1 * output_depth, + output_depth, weight_dims, 
concat_weights, + input_to_cell); + + ExtractQuantLstmWeightsSubmatrix(recurrent_submatrix_dims, 2 * output_depth, + 0, weight_dims, concat_weights, + recurrent_to_forget); + ExtractQuantLstmWeightsSubmatrix(input_submatrix_dims, 2 * output_depth, + output_depth, weight_dims, concat_weights, + input_to_forget); + + ExtractQuantLstmWeightsSubmatrix(recurrent_submatrix_dims, 3 * output_depth, + 0, weight_dims, concat_weights, + recurrent_to_output); + ExtractQuantLstmWeightsSubmatrix(input_submatrix_dims, 3 * output_depth, + output_depth, weight_dims, concat_weights, + input_to_output); + + TfLiteIntArrayFree(recurrent_submatrix_dims); + TfLiteIntArrayFree(input_submatrix_dims); +} + +void DecomposeBiasTensor(const int32_t* biases, int bias_size, + std::vector* input_bias, + std::vector* cell_bias, + std::vector* forget_bias, + std::vector* output_bias) { + input_bias->resize(bias_size); + std::copy(biases, biases + bias_size, input_bias->begin()); + + cell_bias->resize(bias_size); + std::copy(biases + bias_size, biases + 2 * bias_size, cell_bias->begin()); + + forget_bias->resize(bias_size); + std::copy(biases + 2 * bias_size, biases + 3 * bias_size, + forget_bias->begin()); + + output_bias->resize(bias_size); + std::copy(biases + 3 * bias_size, biases + 4 * bias_size, + output_bias->begin()); +} + +} // namespace nnapi +} // namespace delegate +} // namespace tflite diff --git a/tensorflow/lite/delegates/nnapi/quant_lstm_sup.h b/tensorflow/lite/delegates/nnapi/quant_lstm_sup.h new file mode 100644 index 00000000000..1385b92fc51 --- /dev/null +++ b/tensorflow/lite/delegates/nnapi/quant_lstm_sup.h @@ -0,0 +1,58 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_LITE_DELEGATES_NNAPI_QUANT_LSTM_SUP_H_ +#define TENSORFLOW_LITE_DELEGATES_NNAPI_QUANT_LSTM_SUP_H_ + +#include + +#include "tensorflow/lite/c/c_api_internal.h" + +namespace tflite { +namespace delegate { +namespace nnapi { + +void ExtractQuantLstmWeightsSubmatrix(const TfLiteIntArray* submatrix_dims, + const int32_t offset_row, + const int32_t offset_column, + const TfLiteIntArray* weight_dims, + const uint8_t* weights, + std::vector* submatrix); + +void DecomposeQuantLstmWeightsTensor(const uint8_t* concat_weights, + const TfLiteIntArray* weight_dims, + std::vector* recurrent_to_input, + std::vector* input_to_input, + std::vector* recurrent_to_cell, + std::vector* input_to_cell, + std::vector* recurrent_to_forget, + std::vector* input_to_forget, + std::vector* recurrent_to_output, + std::vector* input_to_output); + +void SetWeightSubmatrixDims(const TfLiteIntArray* weight_dims, + TfLiteIntArray* recurrent_submatrix_dims, + TfLiteIntArray* input_submatrix_dims); + +void DecomposeBiasTensor(const int32_t* biases, int bias_size, + std::vector* input_bias, + std::vector* cell_bias, + std::vector* forget_bias, + std::vector* output_bias); + +} // namespace nnapi +} // namespace delegate +} // namespace tflite + +#endif // TENSORFLOW_LITE_DELEGATES_NNAPI_QUANT_LSTM_SUP_H_ diff --git a/tensorflow/lite/delegates/nnapi/quant_lstm_sup_test.cc b/tensorflow/lite/delegates/nnapi/quant_lstm_sup_test.cc new file mode 100644 index 00000000000..2bbf52c147e --- /dev/null +++ b/tensorflow/lite/delegates/nnapi/quant_lstm_sup_test.cc @@ -0,0 +1,344 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/lite/delegates/nnapi/quant_lstm_sup.h" + +#include +#include +#include +#include + +#include +#include +#include "tensorflow/lite/c/c_api_internal.h" +#include "tensorflow/lite/testing/util.h" + +namespace { + +using ::testing::ElementsAreArray; +using ::testing::Test; + +class DimsAllocatingTest : public Test { + protected: + DimsAllocatingTest() : allocated_dims_() {} + + ~DimsAllocatingTest() override { + for (TfLiteIntArray* dim : allocated_dims_) { + TfLiteIntArrayFree(dim); + } + } + + TfLiteIntArray* CreateDimArray(int size, + std::initializer_list dimensions) { + TfLiteIntArray* dims = TfLiteIntArrayCreate(size); + allocated_dims_.push_back(dims); + + int i = 0; + for (const int dimension : dimensions) { + dims->data[i++] = dimension; + } + + return dims; + } + + private: + std::vector allocated_dims_; +}; + +using tflite::delegate::nnapi::ExtractQuantLstmWeightsSubmatrix; + +class ExtractQuantLstmWeightsSubmatrixTest : public DimsAllocatingTest {}; + +TEST_F(ExtractQuantLstmWeightsSubmatrixTest, TopLeftSubmatrixIsExtracted) { + std::vector weights = {1, 2, 3, 4, 5, // + 11, 12, 13, 14, 15, // + 101, 102, 103, 104, 105, // + 111, 112, 113, 114, 115, // + 201, 202, 203, 204, 205, // + 211, 212, 213, 214, 215, // + 221, 222, 223, 224, 225, // + 231, 232, 233, 234, 235}; + const TfLiteIntArray* weight_dims = CreateDimArray(2, {8, 5}); + + std::vector submatrix; + const TfLiteIntArray* submatrix_dims = CreateDimArray(2, {2, 3}); + + ExtractQuantLstmWeightsSubmatrix(submatrix_dims, 0 /* offset_row */, + 0 /* offset_column */, weight_dims, + weights.data(), &submatrix); + + EXPECT_THAT(submatrix, ElementsAreArray({1, 2, 3, 11, 12, 13})); +} + +TEST_F(ExtractQuantLstmWeightsSubmatrixTest, TopRightSubmatrixIsExtracted) { + std::vector weights = {1, 2, 3, 4, 5, // + 11, 12, 13, 14, 15, // + 101, 102, 103, 104, 105, // + 111, 112, 113, 114, 115, // + 201, 202, 203, 204, 205, // + 211, 212, 213, 214, 215, // + 221, 222, 223, 224, 225, // + 231, 232, 233, 234, 235}; + const TfLiteIntArray* weight_dims = CreateDimArray(2, {8, 5}); + + std::vector submatrix; + const TfLiteIntArray* submatrix_dims = CreateDimArray(2, {2, 2}); + + ExtractQuantLstmWeightsSubmatrix(submatrix_dims, 0 /* offset_row */, + 3 /* offset_column */, weight_dims, + weights.data(), &submatrix); + + EXPECT_THAT(submatrix, ElementsAreArray({4, 5, 14, 15})); +} + +TEST_F(ExtractQuantLstmWeightsSubmatrixTest, RightCentralSubmatrixIsExtracted) { + std::vector weights = {1, 2, 3, 4, 5, // + 11, 12, 13, 14, 15, // + 101, 102, 103, 104, 105, // + 111, 112, 113, 114, 115, // + 201, 202, 203, 204, 205, // + 211, 212, 213, 214, 215, // + 221, 222, 223, 224, 225, // + 231, 232, 233, 234, 235}; + const TfLiteIntArray* weight_dims = CreateDimArray(2, {8, 5}); + + std::vector submatrix; + const TfLiteIntArray* submatrix_dims = CreateDimArray(2, {2, 2}); + + ExtractQuantLstmWeightsSubmatrix( + submatrix_dims, 1 * submatrix_dims->data[0] /* offset_row */, + 3 /* offset_column */, weight_dims, weights.data(), &submatrix); + + EXPECT_THAT(submatrix, ElementsAreArray({104, 105, 114, 115})); +} + +using tflite::delegate::nnapi::DecomposeQuantLstmWeightsTensor; + +class QuantLstmWeightDecompTest : public DimsAllocatingTest { + protected: + QuantLstmWeightDecompTest() + : weights_({1, 2, 3, 4, 5, // + 11, 12, 13, 14, 15, // + 101, 102, 103, 104, 105, // + 111, 112, 113, 114, 115, // + 201, 202, 203, 204, 205, // + 211, 212, 213, 214, 215, // + 
221, 222, 223, 224, 225, // + 231, 232, 233, 234, 235}), + // Creating the arrays empty, the size is set by the decomposition + // function + recurrent_to_input_(), + input_to_input_(), + recurrent_to_cell_(), + input_to_cell_(), + recurrent_to_forget_(), + input_to_forget_(), + recurrent_to_output_(), + input_to_output_() { + weight_dims_ = CreateDimArray(2, {8, 5}); + } + + const std::vector weights_; + const TfLiteIntArray* weight_dims_; + std::vector recurrent_to_input_; + std::vector input_to_input_; + std::vector recurrent_to_cell_; + std::vector input_to_cell_; + std::vector recurrent_to_forget_; + std::vector input_to_forget_; + std::vector recurrent_to_output_; + std::vector input_to_output_; +}; + +TEST_F(QuantLstmWeightDecompTest, ExtractRecurrentToInput) { + DecomposeQuantLstmWeightsTensor( + weights_.data(), weight_dims_, &recurrent_to_input_, &input_to_input_, + &recurrent_to_cell_, &input_to_cell_, &recurrent_to_forget_, + &input_to_forget_, &recurrent_to_output_, &input_to_output_); + + EXPECT_THAT(recurrent_to_input_, ElementsAreArray({1, 2, // + 11, 12})); +} + +TEST_F(QuantLstmWeightDecompTest, ExtractInputToInput) { + DecomposeQuantLstmWeightsTensor( + weights_.data(), weight_dims_, &recurrent_to_input_, &input_to_input_, + &recurrent_to_cell_, &input_to_cell_, &recurrent_to_forget_, + &input_to_forget_, &recurrent_to_output_, &input_to_output_); + + EXPECT_THAT(input_to_input_, ElementsAreArray({3, 4, 5, // + 13, 14, 15})); +} + +TEST_F(QuantLstmWeightDecompTest, ExtractRecurrentToCell) { + DecomposeQuantLstmWeightsTensor( + weights_.data(), weight_dims_, &recurrent_to_input_, &input_to_input_, + &recurrent_to_cell_, &input_to_cell_, &recurrent_to_forget_, + &input_to_forget_, &recurrent_to_output_, &input_to_output_); + + EXPECT_THAT(recurrent_to_cell_, ElementsAreArray({101, 102, // + 111, 112})); +} + +TEST_F(QuantLstmWeightDecompTest, ExtractInputToCell) { + DecomposeQuantLstmWeightsTensor( + weights_.data(), weight_dims_, &recurrent_to_input_, &input_to_input_, + &recurrent_to_cell_, &input_to_cell_, &recurrent_to_forget_, + &input_to_forget_, &recurrent_to_output_, &input_to_output_); + + EXPECT_THAT(input_to_cell_, ElementsAreArray({103, 104, 105, // + 113, 114, 115})); +} + +TEST_F(QuantLstmWeightDecompTest, ExtractRecurrentToForget) { + DecomposeQuantLstmWeightsTensor( + weights_.data(), weight_dims_, &recurrent_to_input_, &input_to_input_, + &recurrent_to_cell_, &input_to_cell_, &recurrent_to_forget_, + &input_to_forget_, &recurrent_to_output_, &input_to_output_); + + EXPECT_THAT(recurrent_to_forget_, ElementsAreArray({201, 202, // + 211, 212})); +} + +TEST_F(QuantLstmWeightDecompTest, ExtractInputToForget) { + DecomposeQuantLstmWeightsTensor( + weights_.data(), weight_dims_, &recurrent_to_input_, &input_to_input_, + &recurrent_to_cell_, &input_to_cell_, &recurrent_to_forget_, + &input_to_forget_, &recurrent_to_output_, &input_to_output_); + + EXPECT_THAT(input_to_forget_, ElementsAreArray({203, 204, 205, // + 213, 214, 215})); +} + +TEST_F(QuantLstmWeightDecompTest, ExtractRecurrentToOutput) { + DecomposeQuantLstmWeightsTensor( + weights_.data(), weight_dims_, &recurrent_to_input_, &input_to_input_, + &recurrent_to_cell_, &input_to_cell_, &recurrent_to_forget_, + &input_to_forget_, &recurrent_to_output_, &input_to_output_); + + EXPECT_THAT(recurrent_to_output_, ElementsAreArray({221, 222, // + 231, 232})); +} + +TEST_F(QuantLstmWeightDecompTest, ExtractInputToOutput) { + DecomposeQuantLstmWeightsTensor( + weights_.data(), weight_dims_, &recurrent_to_input_, 
&input_to_input_, + &recurrent_to_cell_, &input_to_cell_, &recurrent_to_forget_, + &input_to_forget_, &recurrent_to_output_, &input_to_output_); + + EXPECT_THAT(input_to_output_, ElementsAreArray({223, 224, 225, // + 233, 234, 235})); +} + +using tflite::delegate::nnapi::DecomposeBiasTensor; + +TEST(DecomposeBiasTensor, ExtractInputBias) { + // clang-format off + std::vector biases + // inputGateBias + {-7876, 13488, -726, 32839, + // cellGateBias + 39481, 48624, 48976, -21419, + // forgetGateBias + 9206, -46884, -11693, -38724, + // outputGateBias + -58999, -17050, -41852, -40538}; + // clang-format on + + std::vector input_bias; + std::vector cell_bias; + std::vector forget_bias; + std::vector output_bias; + DecomposeBiasTensor(biases.data(), 4, &input_bias, &cell_bias, &forget_bias, + &output_bias); + + EXPECT_THAT(input_bias, ElementsAreArray({-7876, 13488, -726, 32839})); +} + +TEST(DecomposeBiasTensor, ExtractCellBias) { + // clang-format off + std::vector biases + // inputGateBias + {-7876, 13488, -726, 32839, + // cellGateBias + 39481, 48624, 48976, -21419, + // forgetGateBias + 9206, -46884, -11693, -38724, + // outputGateBias + -58999, -17050, -41852, -40538}; + // clang-format on + + std::vector input_bias; + std::vector cell_bias; + std::vector forget_bias; + std::vector output_bias; + DecomposeBiasTensor(biases.data(), 4, &input_bias, &cell_bias, &forget_bias, + &output_bias); + + EXPECT_THAT(cell_bias, ElementsAreArray({39481, 48624, 48976, -21419})); +} + +TEST(DecomposeBiasTensor, ExtractForgetBias) { + // clang-format off + std::vector biases + // inputGateBias + {-7876, 13488, -726, 32839, + // cellGateBias + 39481, 48624, 48976, -21419, + // forgetGateBias + 9206, -46884, -11693, -38724, + // outputGateBias + -58999, -17050, -41852, -40538}; + // clang-format on + + std::vector input_bias; + std::vector cell_bias; + std::vector forget_bias; + std::vector output_bias; + DecomposeBiasTensor(biases.data(), 4, &input_bias, &cell_bias, &forget_bias, + &output_bias); + + EXPECT_THAT(forget_bias, ElementsAreArray({9206, -46884, -11693, -38724})); +} + +TEST(DecomposeBiasTensor, ExtractOutputBias) { + // clang-format off + std::vector biases + // inputGateBias + {-7876, 13488, -726, 32839, + // cellGateBias + 39481, 48624, 48976, -21419, + // forgetGateBias + 9206, -46884, -11693, -38724, + // outputGateBias + -58999, -17050, -41852, -40538}; + // clang-format on + + std::vector input_bias; + std::vector cell_bias; + std::vector forget_bias; + std::vector output_bias; + DecomposeBiasTensor(biases.data(), 4, &input_bias, &cell_bias, &forget_bias, + &output_bias); + + EXPECT_THAT(output_bias, ElementsAreArray({-58999, -17050, -41852, -40538})); +} + +} // namespace + +int main(int argc, char** argv) { + ::tflite::LogToStderr(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD index bca715a8ce5..4d3876ec0e5 100644 --- a/tensorflow/lite/kernels/BUILD +++ b/tensorflow/lite/kernels/BUILD @@ -1836,3 +1836,18 @@ cc_test( "@com_google_googletest//:gtest", ], ) + +cc_test( + name = "quant_basic_lstm_test", + size = "small", + srcs = ["quant_basic_lstm_test.cc"], + tags = ["tflite_nnapi"], + deps = [ + ":builtin_ops", + ":kernel_util", + ":test_main", + ":test_util", + "//tensorflow/lite:framework", + "@com_google_googletest//:gtest", + ], +) diff --git a/tensorflow/lite/kernels/kernel_util.h b/tensorflow/lite/kernels/kernel_util.h index a76d925c3bf..3b62c4d691b 100644 --- 
a/tensorflow/lite/kernels/kernel_util.h +++ b/tensorflow/lite/kernels/kernel_util.h @@ -54,14 +54,18 @@ inline int NumIntermediates(const TfLiteNode* node) { return node->intermediates->size; } -inline int64_t NumElements(const TfLiteTensor* t) { +inline int64_t NumElements(const TfLiteIntArray* dims) { int64_t count = 1; - for (int i = 0; i < NumDimensions(t); ++i) { - count *= SizeOfDimension(t, i); + for (int i = 0; i < dims->size; ++i) { + count *= dims->data[i]; } return count; } +inline int64_t NumElements(const TfLiteTensor* t) { + return NumElements(t->dims); +} + inline const TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context, const TfLiteNode* node, int index) { diff --git a/tensorflow/lite/kernels/quant_basic_lstm_test.cc b/tensorflow/lite/kernels/quant_basic_lstm_test.cc new file mode 100644 index 00000000000..e8f7ad3fc58 --- /dev/null +++ b/tensorflow/lite/kernels/quant_basic_lstm_test.cc @@ -0,0 +1,230 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include +#include +#include + +#include +#include +#include "tensorflow/lite/interpreter.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/kernels/register.h" +#include "tensorflow/lite/kernels/test_util.h" +#include "tensorflow/lite/model.h" + +namespace tflite { +namespace { + +using ::testing::ElementsAreArray; + +class QuantizedLSTMOpModel : public SingleOpModel { + public: + QuantizedLSTMOpModel(int numBatches, int inputSize, float weightsScale, + int32_t weightsZeroPoint, int outputSize, + std::initializer_list weights, + std::initializer_list biases) { + std::vector inputs; + + input_size_ = inputSize; + output_size_ = outputSize; + + std::vector input_shape{numBatches, inputSize}; + std::vector output_shape{numBatches, outputSize}; + std::vector weight_shape{4 * outputSize, outputSize + inputSize}; + std::vector state_shape{numBatches, outputSize}; + std::vector bias_shape{4 * outputSize}; + + input_ = + AddInput({TensorType_UINT8, input_shape, 0.0f, 0.0f, 1. / 128., 128}); + prev_output_ = + AddInput({TensorType_UINT8, output_shape, 0.0f, 0.0f, 1. / 128., 128}); + // Biases and Weights have to be constant in order to allow NNAPI + // delegation + weights_ = AddConstInput({TensorType_UINT8, weight_shape, 0.0f, + 0.0f, weightsScale, weightsZeroPoint}, + weights); + biases_ = AddConstInput( + {TensorType_INT32, bias_shape, 0.0f, 0.0f, weightsScale / 128, 0}, + biases); + prev_cell_state_ = + AddInput({TensorType_INT16, state_shape, 0.0f, 0.0f, 1. / 2048., 0}); + + output_ = + AddOutput({TensorType_UINT8, output_shape, 0.0f, 0.0f, 1. / 128., 128}); + cell_state_out_ = + AddOutput({TensorType_INT16, state_shape, 0.0f, 0.0f, 1. / 2048., 0}); + output_concat_temp_ = + AddOutput({TensorType_UINT8, output_shape, 0.0f, 0.0f, 1. / 128., 128}); + output_activation_temp_ = + AddOutput({TensorType_INT16, output_shape, 0.0f, 0.0f, 1. 
/ 128., 128}); + + SetBuiltinOp(BuiltinOperator_LSTM, BuiltinOptions_LSTMOptions, + CreateLSTMOptions(builder_, ActivationFunctionType_TANH, 0.0, + 0.0, LSTMKernelType_BASIC) + .Union()); + + BuildInterpreter({GetShape(input_), GetShape(prev_output_), + GetShape(weights_), GetShape(biases_), + GetShape(prev_cell_state_)}); + + // init feedback inputs to zero + std::vector initial_state(GetTensorSize(cell_state_out_), 0); + PopulateTensor(prev_cell_state_, initial_state); + std::vector initial_prev_output(GetTensorSize(output_), 0); + PopulateTensor(prev_output_, initial_prev_output); + } + + int inputSize() { return input_size_; } + + int outputSize() { return output_size_; } + + void setInput(const std::vector& input) { + PopulateTensor(input_, input); + } + + std::vector getOutput() { return ExtractVector(output_); } + + private: + // Inputs + int input_; + int weights_; + int biases_; + int prev_cell_state_; + int prev_output_; + // Outputs + int cell_state_out_; + int output_; + int output_concat_temp_; + int output_activation_temp_; + + int input_size_; + int output_size_; +}; + +class QuantizedLstmTest : public ::testing::Test { + protected: + void VerifyGoldens(const std::vector>& input, + const std::vector>& output, + QuantizedLSTMOpModel* lstm) { + const int numBatches = input.size(); + ASSERT_GT(numBatches, 0); + const int inputSize = lstm->inputSize(); + ASSERT_GT(inputSize, 0); + const int inputSequenceSize = input[0].size() / inputSize; + ASSERT_GT(inputSequenceSize, 0); + for (int i = 0; i < inputSequenceSize; ++i) { + std::vector inputStep; + for (int b = 0; b < numBatches; ++b) { + const uint8_t* batchStart = input[b].data() + i * inputSize; + const uint8_t* batchEnd = batchStart + inputSize; + inputStep.insert(inputStep.end(), batchStart, batchEnd); + } + lstm->setInput(inputStep); + lstm->Invoke(); + + const int outputSize = lstm->outputSize(); + std::vector expected; + for (int b = 0; b < numBatches; ++b) { + const uint8_t* goldenBatchStart = output[b].data() + i * outputSize; + const uint8_t* goldenBatchEnd = goldenBatchStart + outputSize; + expected.insert(expected.end(), goldenBatchStart, goldenBatchEnd); + } + EXPECT_THAT(lstm->getOutput(), ElementsAreArray(expected)); + } + } +}; + +// Inputs and weights in this test are random and the test only checks that the +// outputs are equal to outputs obtained from running TF Lite version of +// quantized LSTM on the same inputs. 
+TEST_F(QuantizedLstmTest, BasicQuantizedLstmTest) { + const int numBatches = 2; + const int inputSize = 2; + const int outputSize = 4; + + float weightsScale = 0.00408021; + int weightsZeroPoint = 100; + + QuantizedLSTMOpModel lstm( + numBatches, inputSize, weightsScale, weightsZeroPoint, outputSize, + + // This data are copied from QuantizedLSTMTest.cpp in NNAPI source code + // I have to recompose the weight matrix before passing it to the model + + // recurrentToInputWeights inputToInputWeights + {254, 206, 77, 168, 146, 250, 71, 20, 215, 6, 235, 171, 223, 7, 118, 225, + 10, 218, 59, 130, 174, 26, 171, 108, + + // recurrentToCellWeights inputToCellWeights + 172, 60, 205, 65, 133, 34, 14, 0, 140, 168, 29, 49, 240, 223, 133, 56, + 206, 109, 142, 64, 246, 216, 54, 183, + + // recurrentToForgetWeights inputToForgetWeights + 137, 240, 103, 52, 24, 50, 68, 51, 237, 112, 132, 179, 0, 220, 89, 23, + 158, 110, 69, 4, 207, 253, 3, 169, + + // recurrentToOutputWeights inputToOutputWeights + 106, 214, 67, 23, 195, 187, 59, 158, 45, 3, 11, 99, 119, 132, 49, 205, + 109, 10, 129, 218, 11, 98, 218, 48}, + + // inputGateBias + {-7876, 13488, -726, 32839, + // cellGateBias + 39481, 48624, 48976, -21419, + // forgetGateBias + 9206, -46884, -11693, -38724, + // outputGateBias + -58999, -17050, -41852, -40538}); + // clang-format on + + // LSTM input is stored as numBatches x (sequenceLength x inputSize) vector. + std::vector> lstmInput; + // clang-format off + lstmInput = {{154, 166, + 166, 179, + 141, 141}, + {100, 200, + 50, 150, + 111, 222}}; + // clang-format on + + // LSTM output is stored as numBatches x (sequenceLength x outputSize) vector. + std::vector> lstmGoldenOutput; + /* + This is the output used in NNAPI's QuantizedLSTMTest.cpp + I get slightly different values that are consistent running with or + without acceleration + + lstmGoldenOutput = {{136, 150, 140, 115, + 140, 151, 146, 112, + 139, 153, 146, 114}, + {135, 152, 138, 112, + 136, 156, 142, 112, + 141, 154, 146, 108}}; + */ + + // clang-format off + lstmGoldenOutput = {{131, 152, 136, 109, + 138, 150, 145, 111, + 139, 152, 146, 113}, + {131, 153, 135, 107, + 134, 154, 140, 111, + 140, 154, 145, 108}}; + // clang-format on + VerifyGoldens(lstmInput, lstmGoldenOutput, &lstm); +} + +} // namespace +} // namespace tflite diff --git a/tensorflow/lite/nnapi/NeuralNetworksTypes.h b/tensorflow/lite/nnapi/NeuralNetworksTypes.h index fc8d2486837..95a313f8456 100644 --- a/tensorflow/lite/nnapi/NeuralNetworksTypes.h +++ b/tensorflow/lite/nnapi/NeuralNetworksTypes.h @@ -41,6 +41,7 @@ enum { ANEURALNETWORKS_TENSOR_QUANT8_ASYMM = 5, ANEURALNETWORKS_BOOL = 6, ANEURALNETWORKS_TENSOR_BOOL8 = 9, + ANEURALNETWORKS_TENSOR_QUANT16_SYMM = 7, ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL = 11, ANEURALNETWORKS_TENSOR_QUANT8_SYMM = 13, }; @@ -115,6 +116,7 @@ enum { ANEURALNETWORKS_POW = 70, ANEURALNETWORKS_PRELU = 71, ANEURALNETWORKS_QUANTIZE = 72, + ANEURALNETWORKS_QUANTIZED_16BIT_LSTM = 73, ANEURALNETWORKS_REDUCE_ANY = 76, ANEURALNETWORKS_REDUCE_MAX = 77, ANEURALNETWORKS_REDUCE_MIN = 78, diff --git a/tensorflow/lite/tools/make/Makefile b/tensorflow/lite/tools/make/Makefile index 89ef6e5c302..c37b7cf67a5 100644 --- a/tensorflow/lite/tools/make/Makefile +++ b/tensorflow/lite/tools/make/Makefile @@ -164,6 +164,7 @@ endif ifeq ($(BUILD_WITH_NNAPI),true) CORE_CC_ALL_SRCS += tensorflow/lite/delegates/nnapi/nnapi_delegate.cc CORE_CC_ALL_SRCS += tensorflow/lite/nnapi/nnapi_implementation.cc + CORE_CC_ALL_SRCS += tensorflow/lite/nnapi/quant_lstm_sup.cc else 
CORE_CC_ALL_SRCS += tensorflow/lite/delegates/nnapi/nnapi_delegate_disabled.cc CORE_CC_ALL_SRCS += tensorflow/lite/nnapi/nnapi_implementation_disabled.cc From 56a12c26c541a104c7899b07874d7830bdb7e158 Mon Sep 17 00:00:00 2001 From: captain-pool Date: Thu, 25 Jul 2019 17:43:20 +0530 Subject: [PATCH 0554/3053] Added repr for TensorSpec --- tensorflow/python/tools/saved_model_cli.py | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py index fc10c8dc9a5..649d0c0bf89 100644 --- a/tensorflow/python/tools/saved_model_cli.py +++ b/tensorflow/python/tools/saved_model_cli.py @@ -212,7 +212,7 @@ def _print_args(arguments, argument_type="Argument", indent=0): if indent == 3: in_print('%s #%d' % (argument_type, index)) if isinstance(element, tensor_spec.TensorSpec): - _print_tensor_spec(element, indent) + print((indent + 1) * ' ' + '%s: %s'%(element.name, repr(element))) elif is_nested(element): in_print(' DType: %s' % type(element).__name__) in_print(' Values: [', end='') @@ -233,26 +233,6 @@ def _print_args(arguments, argument_type="Argument", indent=0): in_print(' DType: %s' % type(element).__name__) in_print(' Value: %s' % str(element)) - -def _print_tensor_spec(tensor_spec, indent=0): - """Prints details of the given tensor_spec. - - Args: - tensor_spec: TensorSpec object to be printed. - indent: How far (in increments of 2 spaces) to indent each line output - """ - indent_str = ' ' * indent - - def in_print(s): - print(indent_str + s) - in_print( - ' %s: Tensor(shape=%s, dtype=%s, name=\'%s\')' % - (tensor_spec.name, - tensor_spec.shape, - tensor_spec.dtype.name, - tensor_spec.name)) - - def _print_tensor_info(tensor_info, indent=0): """Prints details of the given tensor_info. 
From 3658ff1ada8e97cc887e0ec83313d3750c7bd4e9 Mon Sep 17 00:00:00 2001 From: captain-pool Date: Thu, 25 Jul 2019 17:45:34 +0530 Subject: [PATCH 0555/3053] removed unnecessary bracket --- tensorflow/python/tools/saved_model_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py index 649d0c0bf89..75ce7e71ebd 100644 --- a/tensorflow/python/tools/saved_model_cli.py +++ b/tensorflow/python/tools/saved_model_cli.py @@ -180,7 +180,7 @@ def _show_defined_functions(saved_model_dir): print(' Function Name: \'%s\'' % name) for index, concrete_functions in enumerate( function._list_all_concrete_functions_for_serialization(), 1): - args, kwargs = (concrete_functions.structured_input_signature) + args, kwargs = concrete_functions.structured_input_signature print(' Option #%d' % index) print(' Callable with:') _print_args(args, indent=3) From 5c1df39e1720fe481bb442b98ef7310f096d6f98 Mon Sep 17 00:00:00 2001 From: captain-pool Date: Thu, 25 Jul 2019 18:25:57 +0530 Subject: [PATCH 0556/3053] Fix Dictionaries Arguments not printed --- tensorflow/python/tools/saved_model_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py index 75ce7e71ebd..fe751533584 100644 --- a/tensorflow/python/tools/saved_model_cli.py +++ b/tensorflow/python/tools/saved_model_cli.py @@ -207,7 +207,7 @@ def _print_args(arguments, argument_type="Argument", indent=0): def is_nested(args): return nest.is_nested(args) and not isinstance(args, dict) - if is_nested(arguments): + if nest.is_nested(arguments): for index, element in enumerate(arguments, 1): if indent == 3: in_print('%s #%d' % (argument_type, index)) From e8510ab01da8a9f9ac7691c16cb640a7bfd45526 Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Thu, 25 Jul 2019 06:20:42 -0700 Subject: [PATCH 0557/3053] [XLA] Improve thread-safety of HLO snapshot dumping. Currently the code keeps a mutable HloSnapshot attached to the xla::Executable object. This cannot work correctly in the presence of concurrent executions. Instead, keep only an immutable HloProto attached to xla::Executable and construct ephemeral HloSnapshots during dumping. This has the minor downside that it requires copying the HloProto each time we dump, but presumably if you are dumping HLO snapshots you don't particularly care about performance. 
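Concretely, the pattern is: keep the compiled program as shared, immutable data on the executable, and build a fresh snapshot object inside each execution. The sketch below is a minimal, standalone illustration of that pattern only; Proto, Snapshot, and Executable here are hypothetical stand-ins, not XLA's real HloProto/HloSnapshot/Executable classes. Each call copies the shared immutable proto into a call-local snapshot, so concurrent runs never write to the same object; the cost is one proto copy per dumped execution, which matches the trade-off described above.

    #include <memory>
    #include <string>
    #include <thread>
    #include <vector>

    // Hypothetical stand-ins for HloProto / HloSnapshot; not XLA's real types.
    struct Proto {
      std::string hlo_module;
    };

    struct Snapshot {
      Proto hlo;                           // copy of the immutable program
      std::vector<std::string> arguments;  // per-execution inputs
      std::string result;                  // per-execution output
    };

    class Executable {
     public:
      explicit Executable(Proto proto)
          : hlo_proto_(std::make_unique<const Proto>(std::move(proto))) {}

      // Builds an ephemeral Snapshot per call; nothing shared is mutated,
      // so concurrent executions cannot race on snapshot state.
      Snapshot Run(const std::string& argument) const {
        Snapshot snapshot;
        snapshot.hlo = *hlo_proto_;              // copy the immutable proto
        snapshot.arguments.push_back(argument);  // record call-local state
        snapshot.result = "result(" + argument + ")";
        return snapshot;
      }

     private:
      std::unique_ptr<const Proto> hlo_proto_;  // set once, never mutated
    };

    int main() {
      Executable executable(Proto{"example_module"});
      std::vector<std::thread> threads;
      for (int i = 0; i < 4; ++i) {
        threads.emplace_back([&executable, i] {
          Snapshot s = executable.Run("arg" + std::to_string(i));
          (void)s;  // a real implementation would dump s to disk here
        });
      }
      for (std::thread& t : threads) t.join();
      return 0;
    }
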
PiperOrigin-RevId: 259934176 --- .../compiler/xla/client/local_client.cc | 11 ++--- tensorflow/compiler/xla/service/executable.h | 12 +++--- tensorflow/compiler/xla/service/service.cc | 41 ++++++++++--------- 3 files changed, 34 insertions(+), 30 deletions(-) diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc index 1bd9d7b7228..427bdf878f0 100644 --- a/tensorflow/compiler/xla/client/local_client.cc +++ b/tensorflow/compiler/xla/client/local_client.cc @@ -196,15 +196,16 @@ StatusOr LocalExecutable::RunAsync( StatusOr LocalExecutable::ExecuteAndDump( const ServiceExecutableRunOptions* run_options, const absl::Span arguments) { - executable_->hlo_snapshot()->set_execution_platform( - backend_->platform()->Name()); - TF_RETURN_IF_ERROR(RecordArguments(arguments, executable_->hlo_snapshot())); + HloSnapshot snapshot; + *snapshot.mutable_hlo() = *executable_->hlo_proto(); + snapshot.set_execution_platform(backend_->platform()->Name()); + TF_RETURN_IF_ERROR(RecordArguments(arguments, &snapshot)); TF_ASSIGN_OR_RETURN( ScopedShapedBuffer result, executable_->ExecuteOnStream(run_options, arguments, /*hlo_execution_profile=*/nullptr)); - TF_RETURN_IF_ERROR(RecordResult(&result, executable_->hlo_snapshot())); - DumpHloSnapshotIfEnabled(executable_->module(), *executable_->hlo_snapshot()); + TF_RETURN_IF_ERROR(RecordResult(&result, &snapshot)); + DumpHloSnapshotIfEnabled(executable_->module(), snapshot); return std::move(result); } diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h index 492ea72228d..78ee8757441 100644 --- a/tensorflow/compiler/xla/service/executable.h +++ b/tensorflow/compiler/xla/service/executable.h @@ -224,11 +224,11 @@ class Executable { virtual int64 SizeInBytes(); // Dumping helpers. - void set_hlo_snapshot(std::unique_ptr hlo_snapshot) { - hlo_snapshot_ = std::move(hlo_snapshot); + void set_hlo_proto(std::unique_ptr hlo_proto) { + hlo_proto_ = std::move(hlo_proto); } - bool dumping_snapshot() const { return hlo_snapshot_ != nullptr; } - HloSnapshot* hlo_snapshot() const { return hlo_snapshot_.get(); } + bool dumping_snapshot() const { return hlo_proto_ != nullptr; } + HloProto const* hlo_proto() const { return hlo_proto_.get(); } protected: mutable tensorflow::mutex mutex_; @@ -241,8 +241,8 @@ class Executable { // around. const std::shared_ptr hlo_module_; - // HloSnapshot this was compiled from. Null if not dumping executions. - std::unique_ptr hlo_snapshot_; + // The serialized HLO proto. Non-null only if dumping snapshots is enabled. + std::unique_ptr hlo_proto_; // Execution count, used to generate a unique filename for each dumped // execution. diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc index 5ec45eb491a..9625fd011de 100644 --- a/tensorflow/compiler/xla/service/service.cc +++ b/tensorflow/compiler/xla/service/service.cc @@ -351,11 +351,11 @@ StatusOr>> Service::BuildExecutables( VLOG(1) << StrFormat("BuildExecutable on service %p", this); // Dump computation proto state if flag is set. 
- std::vector> hlo_snapshots; + std::vector> hlo_protos; for (int64 i = 0; i < module_protos.size(); ++i) { - auto hlo_snapshot = absl::make_unique(); - *hlo_snapshot->mutable_hlo()->mutable_hlo_module() = *module_protos[i]; - hlo_snapshots.push_back(std::move(hlo_snapshot)); + auto hlo_proto = absl::make_unique(); + *hlo_proto->mutable_hlo_module() = *module_protos[i]; + hlo_protos.push_back(std::move(hlo_proto)); } VLOG(1) << "Computations:"; @@ -383,7 +383,7 @@ StatusOr>> Service::BuildExecutables( const auto& debug_opts = module_configs[i]->debug_options(); if (DumpingEnabledForHloModule(module_protos[i]->name(), debug_opts) && debug_opts.xla_dump_hlo_snapshots()) { - executables[i]->set_hlo_snapshot(std::move(hlo_snapshots[i])); + executables[i]->set_hlo_proto(std::move(hlo_protos[i])); } } @@ -692,14 +692,17 @@ Status Service::ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg, executable_ptrs.push_back(executable.get()); } + std::vector snapshots; + snapshots.resize(executable_ptrs.size()); for (int i = 0; i < executable_ptrs.size(); i++) { if (executable_ptrs[i]->dumping_snapshot()) { + *snapshots[i].mutable_hlo() = *executable_ptrs[i]->hlo_proto(); TF_ASSIGN_OR_RETURN(auto stream, execute_backend_->BorrowStream( all_executors[i][0]->device_ordinal())); TF_RETURN_IF_ERROR(RecordArguments(all_arguments[i].front(), stream.get(), execute_backend_->transfer_manager(), - executable_ptrs[i]->hlo_snapshot())); + &snapshots[i])); } } @@ -746,9 +749,8 @@ Status Service::ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg, execute_backend_->BorrowStream(all_executors[i][0])); TF_RETURN_IF_ERROR(RecordResult(*result_buffer, stream.get(), execute_backend_->transfer_manager(), - executable->hlo_snapshot())); - DumpHloSnapshotIfEnabled(executable->module(), - *executable->hlo_snapshot()); + &snapshots[i])); + DumpHloSnapshotIfEnabled(executable->module(), snapshots[i]); } } @@ -803,9 +805,9 @@ StatusOr> Service::BuildExecutable( const auto& debug_opts = module_config->debug_options(); if (DumpingEnabledForHloModule(module_proto.name(), debug_opts) && debug_opts.xla_dump_hlo_snapshots()) { - auto hlo_snapshot = absl::make_unique(); - *hlo_snapshot->mutable_hlo()->mutable_hlo_module() = module_proto; - executable->set_hlo_snapshot(std::move(hlo_snapshot)); + auto hlo_proto = absl::make_unique(); + *hlo_proto->mutable_hlo_module() = module_proto; + executable->set_hlo_proto(std::move(hlo_proto)); } return std::move(executable); @@ -891,12 +893,13 @@ Status Service::Execute(const ExecuteRequest* arg, ExecuteResponse* result) { TF_ASSIGN_OR_RETURN(auto stream, execute_backend_->BorrowStream( execute_backend_->default_stream_executor())); + HloSnapshot snapshot; if (executable->dumping_snapshot()) { - executable->hlo_snapshot()->set_execution_platform( - execute_backend_->platform()->Name()); - TF_RETURN_IF_ERROR(RecordArguments( - replicated_arguments.front(), stream.get(), - execute_backend_->transfer_manager(), executable->hlo_snapshot())); + *snapshot.mutable_hlo() = *executable->hlo_proto(); + snapshot.set_execution_platform(execute_backend_->platform()->Name()); + TF_RETURN_IF_ERROR( + RecordArguments(replicated_arguments.front(), stream.get(), + execute_backend_->transfer_manager(), &snapshot)); } TF_ASSIGN_OR_RETURN( @@ -913,8 +916,8 @@ Status Service::Execute(const ExecuteRequest* arg, ExecuteResponse* result) { allocation_tracker_.ResolveForReplica(result->output(), 0)); TF_RETURN_IF_ERROR(RecordResult(*result_buffer, stream.get(), execute_backend_->transfer_manager(), - 
executable->hlo_snapshot())); - DumpHloSnapshotIfEnabled(executable->module(), *executable->hlo_snapshot()); + &snapshot)); + DumpHloSnapshotIfEnabled(executable->module(), snapshot); } VLOG(1) << "successfully completed 'execute' request"; From c91bac90dbc50500b2abab54237447468f3a5a4c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 25 Jul 2019 06:21:36 -0700 Subject: [PATCH 0558/3053] Allow -1 for block dimensions in model_pruning library. A dimension of -1 means the block is the full size of the corresponding weight matrix in that dimension. PiperOrigin-RevId: 259934254 --- .../contrib/model_pruning/python/pruning.py | 10 +++++++-- .../model_pruning/python/pruning_test.py | 21 ++++++++++++++++++- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/tensorflow/contrib/model_pruning/python/pruning.py b/tensorflow/contrib/model_pruning/python/pruning.py index 388384a492f..30375c7f56e 100644 --- a/tensorflow/contrib/model_pruning/python/pruning.py +++ b/tensorflow/contrib/model_pruning/python/pruning.py @@ -172,9 +172,11 @@ def get_pruning_hparams(): nbins: integer number of bins to use for histogram computation block_height: integer - number of rows in a block (defaults to 1) + number of rows in a block (defaults to 1), can be -1 in which + case it is set to the size of the corresponding weight tensor. block_width: integer - number of cols in a block (defaults to 1) + number of cols in a block (defaults to 1), can be -1 in which + case it is set to the size of the corresponding weight tensor. block_pooling_function: string Whether to perform average (AVG) or max (MAX) pooling in the block (default: AVG) @@ -489,6 +491,10 @@ class Pruning(object): if squeezed_weights.get_shape().ndims != 2 or block_dims == [1, 1]: return self._update_mask(weights, threshold) + for i in range(2): + if block_dims[i] == -1: + block_dims[i] = squeezed_weights.get_shape()[i] + if self._block_pooling_function not in ['AVG', 'MAX']: raise ValueError('Unknown pooling function for block sparsity: %s' % self._block_pooling_function) diff --git a/tensorflow/contrib/model_pruning/python/pruning_test.py b/tensorflow/contrib/model_pruning/python/pruning_test.py index 58080ad050d..1a925caab96 100644 --- a/tensorflow/contrib/model_pruning/python/pruning_test.py +++ b/tensorflow/contrib/model_pruning/python/pruning_test.py @@ -129,7 +129,7 @@ class PruningTest(test.TestCase): mask_val = new_mask.eval() self.assertAllEqual(mask_val, expected_mask) - def testBlockMasking(self): + def testBlockMaskingWithNonnegativeBlockDimensions(self): param_list = ["block_height=2", "block_width=2", "threshold_decay=0"] weights_avg = constant_op.constant( @@ -146,6 +146,25 @@ class PruningTest(test.TestCase): self._blockMasking(param_list + ["block_pooling_function=AVG"], weights_avg, expected_mask) + def testBlockMaskingWithNegativeBlockDimensions(self): + param_list = ["block_height=1", "block_width=-1", "threshold_decay=0"] + + weights_avg = constant_op.constant([[0.1, 0.1, 0.1, 0.1], + [0.2, 0.2, 0.2, 0.2], + [0.3, 0.3, 0.3, 0.3], + [0.3, 0.3, 0.4, 0.4]]) + weights_max = constant_op.constant([[0.1, 0.0, 0.1, 0.0], + [0.0, 0.1, 0.0, 0.2], + [0.3, 0.0, 0.3, 0.0], + [0.0, -0.3, 0.0, 0.4]]) + expected_mask = [[0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], + [1., 1., 1., 1.], [1., 1., 1., 1.]] + + self._blockMasking(param_list + ["block_pooling_function=MAX"], weights_max, + expected_mask) + self._blockMasking(param_list + ["block_pooling_function=AVG"], weights_avg, + expected_mask) + def 
testBlockMaskingWithHigherDimensions(self): param_list = ["block_height=2", "block_width=2", "threshold_decay=0"] From 10c647ead41d2495fd005a50f355d55e2527889a Mon Sep 17 00:00:00 2001 From: Edward Loper Date: Thu, 25 Jul 2019 06:22:44 -0700 Subject: [PATCH 0559/3053] Update tf.dynamic_partition to handle RaggedTensor inputs. PiperOrigin-RevId: 259934387 --- .../python/ops/ragged/ragged_dispatch.py | 12 ++++++++++ .../python/ops/ragged/ragged_dispatch_test.py | 23 ++++++++++++++++--- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/ops/ragged/ragged_dispatch.py b/tensorflow/python/ops/ragged/ragged_dispatch.py index 0f67c8c6edc..b17bfc2fe9c 100644 --- a/tensorflow/python/ops/ragged/ragged_dispatch.py +++ b/tensorflow/python/ops/ragged/ragged_dispatch.py @@ -26,6 +26,7 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.ops import array_ops from tensorflow.python.ops import clip_ops +from tensorflow.python.ops import data_flow_ops from tensorflow.python.ops import gen_bitwise_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import parsing_ops @@ -437,6 +438,15 @@ def _ragged_squeeze_v1(input, axis=None, name=None, squeeze_dims=None): # pylin squeeze_dims) return ragged_squeeze_op.squeeze(input, axis, name) + +def _ragged_dynamic_partition(data, partitions, num_partitions, name=None): + """RaggedTensor Dispatch override for tf.dynamic_partition.""" + if not isinstance(num_partitions, int) or num_partitions < 0: + raise TypeError('num_partitions must be a non-negative integer') + result = ragged_array_ops.stack_dynamic_partitions(data, partitions, + num_partitions, name) + return [result[i] for i in range(num_partitions)] + # (original_op, ragged_op, ragged_args) _RAGGED_DISPATCH_OPS = [ (array_ops.batch_gather, ragged_batch_gather_ops.batch_gather, @@ -457,6 +467,8 @@ _RAGGED_DISPATCH_OPS = [ (array_ops.stack, ragged_concat_ops.stack, ['[values]']), (array_ops.tile, ragged_array_ops.tile, ['input']), (array_ops.where, ragged_where_op.where, ['condition', 'x', 'y']), + (data_flow_ops.dynamic_partition, _ragged_dynamic_partition, + ['data', 'partitions']), (math_ops.unsorted_segment_sum, ragged_math_ops.segment_sum, ['data', 'segment_ids']), (math_ops.unsorted_segment_prod, ragged_math_ops.segment_prod, diff --git a/tensorflow/python/ops/ragged/ragged_dispatch_test.py b/tensorflow/python/ops/ragged/ragged_dispatch_test.py index 246a0255c72..c222ea5026a 100644 --- a/tensorflow/python/ops/ragged/ragged_dispatch_test.py +++ b/tensorflow/python/ops/ragged/ragged_dispatch_test.py @@ -29,6 +29,7 @@ from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import clip_ops +from tensorflow.python.ops import data_flow_ops from tensorflow.python.ops import gen_bitwise_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import parsing_ops @@ -728,11 +729,27 @@ class RaggedElementwiseOpsTest(test_util.TensorFlowTestCase, 'axis': [0] }, expected=ragged_factory_ops.constant_value([[1, 2, 3], [4, 5]])), + dict( + op=data_flow_ops.dynamic_partition, + kwargs={ + 'data': ragged_factory_ops.constant_value([[1], [2, 3, 4], [5]]), + 'partitions': [2, 1, 1], + 'num_partitions': 3}, + expected=[ragged_factory_ops.constant_value([], ragged_rank=1), + ragged_factory_ops.constant_value([[2, 3, 4], [5]]), + ragged_factory_ops.constant_value([[1]])], + 
result_is_list=True), ]) - def testRaggedDispatch(self, op, expected, args=(), kwargs=None): + def testRaggedDispatch(self, op, expected, args=(), result_is_list=False, + kwargs=None): if kwargs is None: kwargs = {} result = op(*args, **kwargs) - self.assertAllEqual(result, expected) + if result_is_list: + self.assertLen(result, len(expected)) + for (r, e) in zip(result, expected): + self.assertAllEqual(r, e) + else: + self.assertAllEqual(result, expected) def test_ragged_op_list(self): # Ops that should be listed as supported in both v1 and v2. @@ -768,7 +785,7 @@ class RaggedElementwiseOpsTest(test_util.TensorFlowTestCase, 'strings.substr', 'strings.to_hash_bucket_fast', 'strings.to_hash_bucket_strong', 'strings.to_hash_bucket', 'strings.to_number', 'strings.unicode_script', 'tile', 'truncatediv', - 'truncatemod', 'zeros_like' + 'truncatemod', 'zeros_like', 'dynamic_partition' ] # Ops that should be listed as supported in v1 only. From e0fb6774f3a359c7aa9183727e31afc587dadec5 Mon Sep 17 00:00:00 2001 From: captain-pool Date: Thu, 25 Jul 2019 19:24:58 +0530 Subject: [PATCH 0560/3053] Fixed Printing for Dictionaries --- tensorflow/python/tools/saved_model_cli.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py index fe751533584..6335383158d 100644 --- a/tensorflow/python/tools/saved_model_cli.py +++ b/tensorflow/python/tools/saved_model_cli.py @@ -200,7 +200,7 @@ def _print_args(arguments, argument_type="Argument", indent=0): def _may_be_add_quotes(value): is_quotes = '\'' * isinstance(value, str) - return is_quotes + value + is_quotes + return is_quotes + str(value) + is_quotes def in_print(s, end='\n'): print(indent_str + s, end=end) @@ -221,14 +221,14 @@ def _print_args(arguments, argument_type="Argument", indent=0): elif isinstance(element, dict): in_print(' DType: %s' % type(element).__name__) in_print(' Values: {', end='') - for key, value in element.items(): + for (key, value) in element.items(): if is_nested(element): - in_print(' \'%s\': [' % str(key), end='') + in_print('\n \'%s\': [' % str(key), end='') _print_args(element, indent + 1) in_print(' ]') else: - in_print(' \'%s\': %s' % (str(key), _may_be_add_quotes(value)), end='') - in_print(' }') + print('\'%s\': %s' % (str(key), _may_be_add_quotes(value)), end=', ') + print('\b\b}') else: in_print(' DType: %s' % type(element).__name__) in_print(' Value: %s' % str(element)) From 53da0bc5ceda825873864aeba3a59ef171924ba4 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Thu, 25 Jul 2019 06:52:20 -0700 Subject: [PATCH 0561/3053] Automated rollback of commit e8510ab01da8a9f9ac7691c16cb640a7bfd45526 PiperOrigin-RevId: 259937937 --- .../compiler/xla/client/local_client.cc | 11 +++-- tensorflow/compiler/xla/service/executable.h | 12 +++--- tensorflow/compiler/xla/service/service.cc | 41 +++++++++---------- 3 files changed, 30 insertions(+), 34 deletions(-) diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc index 427bdf878f0..1bd9d7b7228 100644 --- a/tensorflow/compiler/xla/client/local_client.cc +++ b/tensorflow/compiler/xla/client/local_client.cc @@ -196,16 +196,15 @@ StatusOr LocalExecutable::RunAsync( StatusOr LocalExecutable::ExecuteAndDump( const ServiceExecutableRunOptions* run_options, const absl::Span arguments) { - HloSnapshot snapshot; - *snapshot.mutable_hlo() = *executable_->hlo_proto(); - snapshot.set_execution_platform(backend_->platform()->Name()); - TF_RETURN_IF_ERROR(RecordArguments(arguments, &snapshot)); + executable_->hlo_snapshot()->set_execution_platform( + backend_->platform()->Name()); + TF_RETURN_IF_ERROR(RecordArguments(arguments, executable_->hlo_snapshot())); TF_ASSIGN_OR_RETURN( ScopedShapedBuffer result, executable_->ExecuteOnStream(run_options, arguments, /*hlo_execution_profile=*/nullptr)); - TF_RETURN_IF_ERROR(RecordResult(&result, &snapshot)); - DumpHloSnapshotIfEnabled(executable_->module(), snapshot); + TF_RETURN_IF_ERROR(RecordResult(&result, executable_->hlo_snapshot())); + DumpHloSnapshotIfEnabled(executable_->module(), *executable_->hlo_snapshot()); return std::move(result); } diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h index 78ee8757441..492ea72228d 100644 --- a/tensorflow/compiler/xla/service/executable.h +++ b/tensorflow/compiler/xla/service/executable.h @@ -224,11 +224,11 @@ class Executable { virtual int64 SizeInBytes(); // Dumping helpers. - void set_hlo_proto(std::unique_ptr hlo_proto) { - hlo_proto_ = std::move(hlo_proto); + void set_hlo_snapshot(std::unique_ptr hlo_snapshot) { + hlo_snapshot_ = std::move(hlo_snapshot); } - bool dumping_snapshot() const { return hlo_proto_ != nullptr; } - HloProto const* hlo_proto() const { return hlo_proto_.get(); } + bool dumping_snapshot() const { return hlo_snapshot_ != nullptr; } + HloSnapshot* hlo_snapshot() const { return hlo_snapshot_.get(); } protected: mutable tensorflow::mutex mutex_; @@ -241,8 +241,8 @@ class Executable { // around. const std::shared_ptr hlo_module_; - // The serialized HLO proto. Non-null only if dumping snapshots is enabled. - std::unique_ptr hlo_proto_; + // HloSnapshot this was compiled from. Null if not dumping executions. + std::unique_ptr hlo_snapshot_; // Execution count, used to generate a unique filename for each dumped // execution. diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc index 9625fd011de..5ec45eb491a 100644 --- a/tensorflow/compiler/xla/service/service.cc +++ b/tensorflow/compiler/xla/service/service.cc @@ -351,11 +351,11 @@ StatusOr>> Service::BuildExecutables( VLOG(1) << StrFormat("BuildExecutable on service %p", this); // Dump computation proto state if flag is set. 
- std::vector> hlo_protos; + std::vector> hlo_snapshots; for (int64 i = 0; i < module_protos.size(); ++i) { - auto hlo_proto = absl::make_unique(); - *hlo_proto->mutable_hlo_module() = *module_protos[i]; - hlo_protos.push_back(std::move(hlo_proto)); + auto hlo_snapshot = absl::make_unique(); + *hlo_snapshot->mutable_hlo()->mutable_hlo_module() = *module_protos[i]; + hlo_snapshots.push_back(std::move(hlo_snapshot)); } VLOG(1) << "Computations:"; @@ -383,7 +383,7 @@ StatusOr>> Service::BuildExecutables( const auto& debug_opts = module_configs[i]->debug_options(); if (DumpingEnabledForHloModule(module_protos[i]->name(), debug_opts) && debug_opts.xla_dump_hlo_snapshots()) { - executables[i]->set_hlo_proto(std::move(hlo_protos[i])); + executables[i]->set_hlo_snapshot(std::move(hlo_snapshots[i])); } } @@ -692,17 +692,14 @@ Status Service::ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg, executable_ptrs.push_back(executable.get()); } - std::vector snapshots; - snapshots.resize(executable_ptrs.size()); for (int i = 0; i < executable_ptrs.size(); i++) { if (executable_ptrs[i]->dumping_snapshot()) { - *snapshots[i].mutable_hlo() = *executable_ptrs[i]->hlo_proto(); TF_ASSIGN_OR_RETURN(auto stream, execute_backend_->BorrowStream( all_executors[i][0]->device_ordinal())); TF_RETURN_IF_ERROR(RecordArguments(all_arguments[i].front(), stream.get(), execute_backend_->transfer_manager(), - &snapshots[i])); + executable_ptrs[i]->hlo_snapshot())); } } @@ -749,8 +746,9 @@ Status Service::ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg, execute_backend_->BorrowStream(all_executors[i][0])); TF_RETURN_IF_ERROR(RecordResult(*result_buffer, stream.get(), execute_backend_->transfer_manager(), - &snapshots[i])); - DumpHloSnapshotIfEnabled(executable->module(), snapshots[i]); + executable->hlo_snapshot())); + DumpHloSnapshotIfEnabled(executable->module(), + *executable->hlo_snapshot()); } } @@ -805,9 +803,9 @@ StatusOr> Service::BuildExecutable( const auto& debug_opts = module_config->debug_options(); if (DumpingEnabledForHloModule(module_proto.name(), debug_opts) && debug_opts.xla_dump_hlo_snapshots()) { - auto hlo_proto = absl::make_unique(); - *hlo_proto->mutable_hlo_module() = module_proto; - executable->set_hlo_proto(std::move(hlo_proto)); + auto hlo_snapshot = absl::make_unique(); + *hlo_snapshot->mutable_hlo()->mutable_hlo_module() = module_proto; + executable->set_hlo_snapshot(std::move(hlo_snapshot)); } return std::move(executable); @@ -893,13 +891,12 @@ Status Service::Execute(const ExecuteRequest* arg, ExecuteResponse* result) { TF_ASSIGN_OR_RETURN(auto stream, execute_backend_->BorrowStream( execute_backend_->default_stream_executor())); - HloSnapshot snapshot; if (executable->dumping_snapshot()) { - *snapshot.mutable_hlo() = *executable->hlo_proto(); - snapshot.set_execution_platform(execute_backend_->platform()->Name()); - TF_RETURN_IF_ERROR( - RecordArguments(replicated_arguments.front(), stream.get(), - execute_backend_->transfer_manager(), &snapshot)); + executable->hlo_snapshot()->set_execution_platform( + execute_backend_->platform()->Name()); + TF_RETURN_IF_ERROR(RecordArguments( + replicated_arguments.front(), stream.get(), + execute_backend_->transfer_manager(), executable->hlo_snapshot())); } TF_ASSIGN_OR_RETURN( @@ -916,8 +913,8 @@ Status Service::Execute(const ExecuteRequest* arg, ExecuteResponse* result) { allocation_tracker_.ResolveForReplica(result->output(), 0)); TF_RETURN_IF_ERROR(RecordResult(*result_buffer, stream.get(), execute_backend_->transfer_manager(), - 
&snapshot)); + executable->hlo_snapshot())); - DumpHloSnapshotIfEnabled(executable->module(), snapshot); + DumpHloSnapshotIfEnabled(executable->module(), *executable->hlo_snapshot()); } VLOG(1) << "successfully completed 'execute' request"; From 5241b3d7e79420147aed895cb29b88d294deb008 Mon Sep 17 00:00:00 2001 From: Stephen McGroarty Date: Thu, 25 Jul 2019 15:11:06 +0100 Subject: [PATCH 0562/3053] Added comment explaining sharding property of ReplaceInst --- tensorflow/compiler/xla/service/hlo_computation.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h index 111b28a8610..28f87d51729 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.h +++ b/tensorflow/compiler/xla/service/hlo_computation.h @@ -314,6 +314,8 @@ class HloComputation { // Replace old instruction with new instruction. Updates uses and root // instruction. Removes old instruction from computation. Precondition: // old_instruction and new_instruction must have the compatible shapes. + // If |new_instruction| doesn't have any sharding information it will + // receive the sharding information of |old_instruction|. Status ReplaceInstruction(HloInstruction* old_instruction, HloInstruction* new_instruction); From 130a84e59cdb460d6d6c21475302f649b4c16170 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 25 Jul 2019 07:20:43 -0700 Subject: [PATCH 0563/3053] Remove superfluous Dequantize nodes in GPU delegate when executing float16 quantized models. PiperOrigin-RevId: 259941556 --- tensorflow/lite/delegates/gpu/common/BUILD | 1 + .../delegates/gpu/common/model_builder.cc | 147 ++++++++++++--- .../gpu/common/model_builder_test.cc | 177 +++++++++++++++++- 3 files changed, 300 insertions(+), 25 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/common/BUILD b/tensorflow/lite/delegates/gpu/common/BUILD index fe5f5ed89cb..cd31e45e0c5 100644 --- a/tensorflow/lite/delegates/gpu/common/BUILD +++ b/tensorflow/lite/delegates/gpu/common/BUILD @@ -77,6 +77,7 @@ cc_library( ":tensor", "//tensorflow/lite:context", "//tensorflow/lite:kernel_api", + "//tensorflow/lite:util", "//tensorflow/lite/c:c_api_internal", "//tensorflow/lite/kernels:kernel_util", "//tensorflow/lite/schema:schema_fbs", diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.cc b/tensorflow/lite/delegates/gpu/common/model_builder.cc index 159eec57885..e074023f7c7 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder.cc +++ b/tensorflow/lite/delegates/gpu/common/model_builder.cc @@ -43,6 +43,7 @@ limitations under the License. 
#include "tensorflow/lite/delegates/gpu/common/tensor.h" #include "tensorflow/lite/kernels/kernel_util.h" #include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/util.h" namespace tflite { namespace gpu { @@ -708,7 +709,6 @@ class AddOperationParser : public TFLiteOperationParser { } } node->operation.attributes = std::move(attr); - const auto* tf_options = reinterpret_cast(tflite_node->builtin_data); if (!tf_options) { @@ -2226,6 +2226,106 @@ Status GetNodeAndRegistration(TfLiteContext* context, int node_id, return OkStatus(); } +TfLiteIntArray* GetOpsToReplaceFromGraphWithDequantize(TfLiteContext* context) { + TfLiteIntArray* execution_plan = nullptr; + if (context->GetExecutionPlan(context, &execution_plan) != kTfLiteOk) { + context->ReportError(context, "Unable to get graph execution plan."); + return nullptr; + } + std::set errors; + std::unordered_map dequant_nodes; + std::vector ops_to_replace; + std::vector dequant_nodes_to_save; + + // Map the output tensor of a Dequantize nodes to its input tensor. + std::unordered_map node_map; + for (int i = 0; i < execution_plan->size; ++i) { + bool replace_node = false; + // Keep track of any inputs from a Dequantize node. + std::vector inputs_from_dequant; + std::vector orig_inputs; + + TfLiteNode* node = nullptr; + TfLiteRegistration* registration = nullptr; + auto status = GetNodeAndRegistration(context, i, &node, ®istration); + if (!status.ok()) { + context->ReportError(context, status.error_message().c_str()); + return nullptr; + } + if (registration->builtin_code == kTfLiteBuiltinDequantize && + context->tensors[node->inputs->data[0]].type == + TfLiteType::kTfLiteFloat16) { + // Record the output->input mapping for the op. + node_map[node->outputs->data[0]] = node->inputs->data[0]; + // For now, add the node to the list of ops to replace. + ops_to_replace.push_back(i); + // Record the dequant node id, indexed by output id. + dequant_nodes[node->outputs->data[0]] = i; + continue; + } + TfLiteIntArray* inputs = node->inputs; + // Fix the node's inputs (i.e. prune out the preceding dequantize node) + // in order to test if it is supported on the GPU. + for (int j = 0; j < inputs->size; ++j) { + orig_inputs.push_back(inputs->data[j]); + if (node_map.find(inputs->data[j]) != node_map.end()) { + inputs_from_dequant.push_back(dequant_nodes[inputs->data[j]]); + // Remap inputs of this node to the inputs of the preceding dequant. + inputs->data[j] = node_map[inputs->data[j]]; + } + } + status = IsSupported(context, node, registration); + if (status.ok() && + // TODO(eignasheva): resolve sub operation support for metal delegate + // registration->builtin_code != kTfLiteBuiltinSub && + IsAllFloatTensors(context, node->inputs) && + IsAllFloatTensors(context, node->outputs)) { + if (errors.empty()) { + replace_node = true; + ops_to_replace.push_back(i); + } + } else { + // Unable to replace this node. Restore the inputs to the original + // if they were modified. + if (!inputs_from_dequant.empty()) { + TfLiteIntArray* inputs = node->inputs; + for (int j = 0; j < inputs->size; ++j) { + inputs->data[j] = orig_inputs[j]; + } + } + errors.insert(GetOpNameByRegistration(registration) + ": " + + status.error_message()); + } + // if any input is the output of a dequantize node AND we failed to + // replace this op, mark the corresponding dequantize node as a node to + // save. 
+ if (!replace_node && !inputs_from_dequant.empty()) { + dequant_nodes_to_save.insert(dequant_nodes_to_save.end(), + inputs_from_dequant.begin(), + inputs_from_dequant.end()); + } + } + if (!errors.empty()) { + std::string unsupported = absl::StrJoin(errors, "\n"); + std::string error_message = + "Next operations are not supported by GPU delegate:\n" + unsupported + + "\nFirst " + std::to_string(ops_to_replace.size()) + + " operations will run on the GPU, and the remaining " + + std::to_string(execution_plan->size - ops_to_replace.size()) + + " on the CPU."; + context->ReportError(context, error_message.c_str()); + } + // Pop all dequantize nodes that must be preserved. + for (int i = 0; i < dequant_nodes_to_save.size(); ++i) { + auto it = std::find(ops_to_replace.begin(), ops_to_replace.end(), + dequant_nodes_to_save[i]); + if (it != ops_to_replace.end()) { + ops_to_replace.erase(it); + } + } + return ConvertVectorToTfLiteIntArray(ops_to_replace); +} + // TODO(impjdi): Check number of input/output tensors and their dimensions. // TODO(impjdi): Check ops' parameters. TfLiteIntArray* GetOpsToReplace(TfLiteContext* context) { @@ -2234,27 +2334,34 @@ TfLiteIntArray* GetOpsToReplace(TfLiteContext* context) { context->ReportError(context, "Unable to get graph execution plan."); return nullptr; } - TfLiteIntArray* subgraph = TfLiteIntArrayCreate(execution_plan->size); - subgraph->size = 0; - std::set errors; - // Map the output tensor of a Dequantize nodes to its input tensor. - std::unordered_map node_map; + // Dispatch to another function if graph has Dequantize nodes. for (int i = 0; i < execution_plan->size; ++i) { TfLiteNode* node = nullptr; TfLiteRegistration* registration = nullptr; auto status = GetNodeAndRegistration(context, i, &node, ®istration); if (!status.ok()) { context->ReportError(context, status.error_message().c_str()); - TfLiteIntArrayFree(subgraph); return nullptr; } if (registration->builtin_code == kTfLiteBuiltinDequantize && context->tensors[node->inputs->data[0]].type == TfLiteType::kTfLiteFloat16) { - // Record the output->input mapping for the op. - node_map[node->outputs->data[0]] = node->inputs->data[0]; - continue; + return GetOpsToReplaceFromGraphWithDequantize(context); + } + } + + // No Dequantize nodes. Iterate through graph and find ops to replace. + TfLiteIntArray* subgraph = TfLiteIntArrayCreate(execution_plan->size); + subgraph->size = 0; + std::set errors; + for (int i = 0; i < execution_plan->size; ++i) { + TfLiteNode* node = nullptr; + TfLiteRegistration* registration = nullptr; + auto status = GetNodeAndRegistration(context, i, &node, ®istration); + if (!status.ok()) { + context->ReportError(context, status.error_message().c_str()); + return nullptr; } status = IsSupported(context, node, registration); if (status.ok() && @@ -2262,14 +2369,6 @@ TfLiteIntArray* GetOpsToReplace(TfLiteContext* context) { // registration->builtin_code != kTfLiteBuiltinSub && IsAllFloatTensors(context, node->inputs) && IsAllFloatTensors(context, node->outputs)) { - // Fix the node's inputs (i.e. prune out the preceding dequantize node) - // if the op is supported. 
- TfLiteIntArray* inputs = node->inputs; - for (int j = 0; j < inputs->size; ++j) { - if (node_map.find(inputs->data[j]) != node_map.end()) { - inputs->data[j] = node_map[inputs->data[j]]; - } - } if (errors.empty()) subgraph->data[subgraph->size++] = i; } else { errors.insert(GetOpNameByRegistration(registration) + ": " + @@ -2292,12 +2391,17 @@ Status BuildModel(TfLiteContext* context, const TfLiteDelegateParams* delegate_params, GraphFloat32* graph) { std::vector> operations; + std::vector tflite_nodes; for (int i = 0; i < delegate_params->nodes_to_replace->size; ++i) { TfLiteNode* tflite_node = nullptr; TfLiteRegistration* registration = nullptr; RETURN_IF_ERROR(GetNodeAndRegistration( context, delegate_params->nodes_to_replace->data[i], &tflite_node, ®istration)); + if (registration->builtin_code == kTfLiteBuiltinDequantize) { + // Ignore Dequantize nodes. + continue; + } auto op_parser = NewOperationParser(registration); if (!op_parser) { return UnimplementedError( @@ -2306,15 +2410,16 @@ Status BuildModel(TfLiteContext* context, ") is not supported by TFLite GPU Delegate.")); } operations.push_back(std::move(op_parser)); + tflite_nodes.push_back(i); } std::vector>*> tensor_to_value(context->tensors_size, nullptr); - for (int i = 0; i < delegate_params->nodes_to_replace->size; ++i) { + for (int i = 0; i < operations.size(); ++i) { TfLiteNode* tflite_node = nullptr; TfLiteRegistration* registration = nullptr; RETURN_IF_ERROR(GetNodeAndRegistration( - context, delegate_params->nodes_to_replace->data[i], &tflite_node, - ®istration)); + context, delegate_params->nodes_to_replace->data[tflite_nodes[i]], + &tflite_node, ®istration)); ObjectReader reader(graph, context, tflite_node, &tensor_to_value); RETURN_IF_ERROR( operations[i]->Parse(tflite_node, registration, graph, &reader)); diff --git a/tensorflow/lite/delegates/gpu/common/model_builder_test.cc b/tensorflow/lite/delegates/gpu/common/model_builder_test.cc index 31c7c570867..f737612856d 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder_test.cc +++ b/tensorflow/lite/delegates/gpu/common/model_builder_test.cc @@ -212,7 +212,8 @@ TEST(ModelBuilderTest, GetOpsToReplacePrunesFp16DequantizeNodes) { // t0 (FP16) -> DequantNode -> t1 (FP32) -> Add -> t4 // t2 (FP16) -> DequantNode -> t3 (FP32) --/ // - // After pruning, the graph has one node: + // OpsToReplace should choose all three nodes for replacement, and + // the graph on the GPU will look like this (no Dequants): // // t0 (FP16) --> Add -> t4 // t2 (FP16) --/ @@ -237,11 +238,11 @@ TEST(ModelBuilderTest, GetOpsToReplacePrunesFp16DequantizeNodes) { TfLiteIntArray* ops_to_replace = GetOpsToReplace(context); - // Just one node left. - EXPECT_EQ(ops_to_replace->size, 1); + // Replace all nodes. 
+ EXPECT_EQ(ops_to_replace->size, 3); TfLiteNode* node = nullptr; TfLiteRegistration* registration = nullptr; - context->GetNodeAndRegistration(context, ops_to_replace->data[0], &node, + context->GetNodeAndRegistration(context, ops_to_replace->data[2], &node, ®istration); EXPECT_EQ(context->tensors[node->inputs->data[0]].type, TfLiteType::kTfLiteFloat16); @@ -416,6 +417,174 @@ TEST(ModelBuilderTest, GetOpsToReplaceDoesNotPruneUint8) { TfLiteIntArrayFree(ops_to_replace); } +class InterpreterMultiNode { + public: + InterpreterMultiNode() { + void* builtin_data = malloc(sizeof(int)); + EXPECT_EQ(interpreter_.AddTensors(8), kTfLiteOk); + EXPECT_EQ(interpreter_.SetInputs({0, 1, 2}), kTfLiteOk); + EXPECT_EQ(interpreter_.SetOutputs({6, 7}), kTfLiteOk); + + // Add 3 Dequantize Nodes with float16 input. + for (int i = 0; i < 3; ++i) { + const TfLiteRegistration reg_dequant = {/*init=*/nullptr, + /*free=*/nullptr, + /*prepare=*/nullptr, + /*invoke=*/nullptr, + /*profiling_string=*/nullptr, + kTfLiteBuiltinDequantize}; + EXPECT_EQ(interpreter_.AddNodeWithParameters( + /*inputs=*/{i}, /*outputs=*/{i + 3}, /*init_data=*/nullptr, + /*init_data_size=*/0, /*builtin_data=*/nullptr, + /*registration=*/®_dequant), + kTfLiteOk); + } + + // Add the ADD op node that GPU delegate supports. + const TfLiteRegistration reg_add0 = { + [](TfLiteContext* context, const char* buffer, size_t length) { + return reinterpret_cast(new int(1)); + }, + [](TfLiteContext* context, void* buffer) { + delete reinterpret_cast(buffer); + }, + nullptr, + nullptr, + nullptr, + kTfLiteBuiltinAdd}; + + EXPECT_EQ(interpreter_.AddNodeWithParameters( + /*inputs=*/{4, 5}, /*outputs=*/{7}, /*init_data=*/nullptr, + /*init_data_size=*/0, + /*builtin_data=*/builtin_data, + /*registration=*/®_add0), + kTfLiteOk); + + // Add the GreaterThan op node that GPU delegate doesn't support. 
+ const TfLiteRegistration reg_greater = { + [](TfLiteContext* context, const char* buffer, size_t length) { + return reinterpret_cast(new int(1)); + }, + [](TfLiteContext* context, void* buffer) { + delete reinterpret_cast(buffer); + }, + nullptr, + nullptr, + nullptr, + kTfLiteBuiltinGreater}; + + EXPECT_EQ(interpreter_.AddNodeWithParameters( + /*inputs=*/{3, 4}, /*outputs=*/{6}, /*init_data=*/nullptr, + /*init_data_size=*/0, + /*builtin_data=*/builtin_data, + /*registration=*/®_greater), + kTfLiteOk); + + const std::vector dims = {1}; + TfLiteQuantization quantization; + quantization.type = kTfLiteNoQuantization; + EXPECT_EQ( + interpreter_.SetTensorParametersReadWrite( + 0, TfLiteType::kTfLiteFloat16, "t0", dims, quantization, false), + kTfLiteOk); + EXPECT_EQ( + interpreter_.SetTensorParametersReadWrite( + 1, TfLiteType::kTfLiteFloat16, "t1", dims, quantization, false), + kTfLiteOk); + EXPECT_EQ( + interpreter_.SetTensorParametersReadWrite( + 2, TfLiteType::kTfLiteFloat16, "t2", dims, quantization, false), + kTfLiteOk); + EXPECT_EQ( + interpreter_.SetTensorParametersReadWrite( + 3, TfLiteType::kTfLiteFloat32, "t3", dims, quantization, false), + kTfLiteOk); + EXPECT_EQ( + interpreter_.SetTensorParametersReadWrite( + 4, TfLiteType::kTfLiteFloat32, "t4", dims, quantization, false), + kTfLiteOk); + EXPECT_EQ( + interpreter_.SetTensorParametersReadWrite( + 5, TfLiteType::kTfLiteFloat32, "t5", dims, quantization, false), + kTfLiteOk); + EXPECT_EQ( + interpreter_.SetTensorParametersReadWrite( + 6, TfLiteType::kTfLiteFloat32, "t5", dims, quantization, false), + kTfLiteOk); + EXPECT_EQ( + interpreter_.SetTensorParametersReadWrite( + 7, TfLiteType::kTfLiteFloat32, "t5", dims, quantization, false), + kTfLiteOk); + exec_plan_ = TfLiteIntArrayCreate(5); + exec_plan_->data[0] = 0; + exec_plan_->data[1] = 1; + exec_plan_->data[2] = 2; + exec_plan_->data[3] = 3; + exec_plan_->data[4] = 4; + } + + ~InterpreterMultiNode() { TfLiteIntArrayFree(exec_plan_); } + + Subgraph* GetSubgraph() { return interpreter_.subgraph(0); } + TfLiteIntArray* exec_plan() const { return exec_plan_; } + + private: + Interpreter interpreter_; + TfLiteIntArray* exec_plan_; +}; + +InterpreterMultiNode* interpreter_mn = new InterpreterMultiNode(); + +TEST(ModelBuilderTest, GetOpsToReplaceSelectsCorrectDequants) { + // A graph with three Dequant nodes feeding two ops, 'Add' and 'Greater'. + // 'Add' can be replaced by the GPU delegate, but 'Greater' can not. + // t0 (FP16) --> Dequant --> t3 (FP32) --> Greater -> t6 + // t1 (FP16) --> Dequant --> t4 (FP32) --/ + // --\ + // t3 (FP16) --> Dequant --> t5 (FP32) --> Add -> t7 + // + // OpsToReplace should replace the 'Add' op and the Dequant outputing + // t5, but leave the other Dequant nodes because 'Greater' must run + // on the CPU. + TfLiteContext* context = interpreter_mn->GetSubgraph()->context(); + + // These functions are meant to be called inside delegates. Swap out + // for similar functions to permit direct calling of GetOpsToReplace. 
+ context->GetExecutionPlan = [](struct TfLiteContext* context, + TfLiteIntArray** execution_plan) { + *execution_plan = interpreter_mn->exec_plan(); + return kTfLiteOk; + }; + context->GetNodeAndRegistration = [](struct TfLiteContext*, int node_index, + TfLiteNode** node, + TfLiteRegistration** registration) { + auto& node_and_reg = + interpreter_mn->GetSubgraph()->nodes_and_registration()[node_index]; + *node = &node_and_reg.first; + *registration = &node_and_reg.second; + return kTfLiteOk; + }; + + TfLiteIntArray* ops_to_replace = GetOpsToReplace(context); + + EXPECT_EQ(ops_to_replace->size, 2); + // Op at index 2 is the Dequant op (t3 -> t5). + EXPECT_EQ(ops_to_replace->data[0], 2); + // Op at index 3 is the Add op. + EXPECT_EQ(ops_to_replace->data[1], 3); + + TfLiteNode* node = nullptr; + TfLiteRegistration* registration = nullptr; + // Verify that Add op has fp16 inputs. + context->GetNodeAndRegistration(context, ops_to_replace->data[1], &node, + ®istration); + EXPECT_EQ(context->tensors[node->inputs->data[0]].type, + TfLiteType::kTfLiteFloat16); + EXPECT_EQ(context->tensors[node->inputs->data[1]].type, + TfLiteType::kTfLiteFloat16); + TfLiteIntArrayFree(ops_to_replace); +} + } // namespace } // namespace gpu } // namespace tflite From 3f266b1c8d18518c6c722fade13b45049c507261 Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Thu, 25 Jul 2019 07:52:48 -0700 Subject: [PATCH 0564/3053] Automated rollback of commit 53da0bc5ceda825873864aeba3a59ef171924ba4 PiperOrigin-RevId: 259945774 --- .../compiler/xla/client/local_client.cc | 11 ++--- tensorflow/compiler/xla/service/executable.h | 12 +++--- tensorflow/compiler/xla/service/service.cc | 41 ++++++++++--------- 3 files changed, 34 insertions(+), 30 deletions(-) diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc index 1bd9d7b7228..427bdf878f0 100644 --- a/tensorflow/compiler/xla/client/local_client.cc +++ b/tensorflow/compiler/xla/client/local_client.cc @@ -196,15 +196,16 @@ StatusOr LocalExecutable::RunAsync( StatusOr LocalExecutable::ExecuteAndDump( const ServiceExecutableRunOptions* run_options, const absl::Span arguments) { - executable_->hlo_snapshot()->set_execution_platform( - backend_->platform()->Name()); - TF_RETURN_IF_ERROR(RecordArguments(arguments, executable_->hlo_snapshot())); + HloSnapshot snapshot; + *snapshot.mutable_hlo() = *executable_->hlo_proto(); + snapshot.set_execution_platform(backend_->platform()->Name()); + TF_RETURN_IF_ERROR(RecordArguments(arguments, &snapshot)); TF_ASSIGN_OR_RETURN( ScopedShapedBuffer result, executable_->ExecuteOnStream(run_options, arguments, /*hlo_execution_profile=*/nullptr)); - TF_RETURN_IF_ERROR(RecordResult(&result, executable_->hlo_snapshot())); - DumpHloSnapshotIfEnabled(executable_->module(), *executable_->hlo_snapshot()); + TF_RETURN_IF_ERROR(RecordResult(&result, &snapshot)); + DumpHloSnapshotIfEnabled(executable_->module(), snapshot); return std::move(result); } diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h index 492ea72228d..78ee8757441 100644 --- a/tensorflow/compiler/xla/service/executable.h +++ b/tensorflow/compiler/xla/service/executable.h @@ -224,11 +224,11 @@ class Executable { virtual int64 SizeInBytes(); // Dumping helpers. 
- void set_hlo_snapshot(std::unique_ptr hlo_snapshot) { - hlo_snapshot_ = std::move(hlo_snapshot); + void set_hlo_proto(std::unique_ptr hlo_proto) { + hlo_proto_ = std::move(hlo_proto); } - bool dumping_snapshot() const { return hlo_snapshot_ != nullptr; } - HloSnapshot* hlo_snapshot() const { return hlo_snapshot_.get(); } + bool dumping_snapshot() const { return hlo_proto_ != nullptr; } + HloProto const* hlo_proto() const { return hlo_proto_.get(); } protected: mutable tensorflow::mutex mutex_; @@ -241,8 +241,8 @@ class Executable { // around. const std::shared_ptr hlo_module_; - // HloSnapshot this was compiled from. Null if not dumping executions. - std::unique_ptr hlo_snapshot_; + // The serialized HLO proto. Non-null only if dumping snapshots is enabled. + std::unique_ptr hlo_proto_; // Execution count, used to generate a unique filename for each dumped // execution. diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc index 5ec45eb491a..9625fd011de 100644 --- a/tensorflow/compiler/xla/service/service.cc +++ b/tensorflow/compiler/xla/service/service.cc @@ -351,11 +351,11 @@ StatusOr>> Service::BuildExecutables( VLOG(1) << StrFormat("BuildExecutable on service %p", this); // Dump computation proto state if flag is set. - std::vector> hlo_snapshots; + std::vector> hlo_protos; for (int64 i = 0; i < module_protos.size(); ++i) { - auto hlo_snapshot = absl::make_unique(); - *hlo_snapshot->mutable_hlo()->mutable_hlo_module() = *module_protos[i]; - hlo_snapshots.push_back(std::move(hlo_snapshot)); + auto hlo_proto = absl::make_unique(); + *hlo_proto->mutable_hlo_module() = *module_protos[i]; + hlo_protos.push_back(std::move(hlo_proto)); } VLOG(1) << "Computations:"; @@ -383,7 +383,7 @@ StatusOr>> Service::BuildExecutables( const auto& debug_opts = module_configs[i]->debug_options(); if (DumpingEnabledForHloModule(module_protos[i]->name(), debug_opts) && debug_opts.xla_dump_hlo_snapshots()) { - executables[i]->set_hlo_snapshot(std::move(hlo_snapshots[i])); + executables[i]->set_hlo_proto(std::move(hlo_protos[i])); } } @@ -692,14 +692,17 @@ Status Service::ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg, executable_ptrs.push_back(executable.get()); } + std::vector snapshots; + snapshots.resize(executable_ptrs.size()); for (int i = 0; i < executable_ptrs.size(); i++) { if (executable_ptrs[i]->dumping_snapshot()) { + *snapshots[i].mutable_hlo() = *executable_ptrs[i]->hlo_proto(); TF_ASSIGN_OR_RETURN(auto stream, execute_backend_->BorrowStream( all_executors[i][0]->device_ordinal())); TF_RETURN_IF_ERROR(RecordArguments(all_arguments[i].front(), stream.get(), execute_backend_->transfer_manager(), - executable_ptrs[i]->hlo_snapshot())); + &snapshots[i])); } } @@ -746,9 +749,8 @@ Status Service::ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg, execute_backend_->BorrowStream(all_executors[i][0])); TF_RETURN_IF_ERROR(RecordResult(*result_buffer, stream.get(), execute_backend_->transfer_manager(), - executable->hlo_snapshot())); - DumpHloSnapshotIfEnabled(executable->module(), - *executable->hlo_snapshot()); + &snapshots[i])); + DumpHloSnapshotIfEnabled(executable->module(), snapshots[i]); } } @@ -803,9 +805,9 @@ StatusOr> Service::BuildExecutable( const auto& debug_opts = module_config->debug_options(); if (DumpingEnabledForHloModule(module_proto.name(), debug_opts) && debug_opts.xla_dump_hlo_snapshots()) { - auto hlo_snapshot = absl::make_unique(); - *hlo_snapshot->mutable_hlo()->mutable_hlo_module() = module_proto; - 
executable->set_hlo_snapshot(std::move(hlo_snapshot)); + auto hlo_proto = absl::make_unique(); + *hlo_proto->mutable_hlo_module() = module_proto; + executable->set_hlo_proto(std::move(hlo_proto)); } return std::move(executable); @@ -891,12 +893,13 @@ Status Service::Execute(const ExecuteRequest* arg, ExecuteResponse* result) { TF_ASSIGN_OR_RETURN(auto stream, execute_backend_->BorrowStream( execute_backend_->default_stream_executor())); + HloSnapshot snapshot; if (executable->dumping_snapshot()) { - executable->hlo_snapshot()->set_execution_platform( - execute_backend_->platform()->Name()); - TF_RETURN_IF_ERROR(RecordArguments( - replicated_arguments.front(), stream.get(), - execute_backend_->transfer_manager(), executable->hlo_snapshot())); + *snapshot.mutable_hlo() = *executable->hlo_proto(); + snapshot.set_execution_platform(execute_backend_->platform()->Name()); + TF_RETURN_IF_ERROR( + RecordArguments(replicated_arguments.front(), stream.get(), + execute_backend_->transfer_manager(), &snapshot)); } TF_ASSIGN_OR_RETURN( @@ -913,8 +916,8 @@ Status Service::Execute(const ExecuteRequest* arg, ExecuteResponse* result) { allocation_tracker_.ResolveForReplica(result->output(), 0)); TF_RETURN_IF_ERROR(RecordResult(*result_buffer, stream.get(), execute_backend_->transfer_manager(), - executable->hlo_snapshot())); - DumpHloSnapshotIfEnabled(executable->module(), *executable->hlo_snapshot()); + &snapshot)); + DumpHloSnapshotIfEnabled(executable->module(), snapshot); } VLOG(1) << "successfully completed 'execute' request"; From 53b1cf9a39484900da642767896871125d7bff84 Mon Sep 17 00:00:00 2001 From: Guangda Lai <31743510+aaroey@users.noreply.github.com> Date: Thu, 25 Jul 2019 09:00:05 -0700 Subject: [PATCH 0565/3053] Fix format --- tensorflow/python/compiler/tensorrt/trt_convert.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/compiler/tensorrt/trt_convert.py b/tensorflow/python/compiler/tensorrt/trt_convert.py index b9b6ca91587..8ffb6a9793e 100644 --- a/tensorflow/python/compiler/tensorrt/trt_convert.py +++ b/tensorflow/python/compiler/tensorrt/trt_convert.py @@ -94,7 +94,9 @@ class TrtPrecisionMode(object): @staticmethod def supported_precision_modes(): - precisions = [TrtPrecisionMode.FP32, TrtPrecisionMode.FP16, TrtPrecisionMode.INT8] + precisions = [ + TrtPrecisionMode.FP32, TrtPrecisionMode.FP16, TrtPrecisionMode.INT8 + ] return precisions + [p.lower() for p in precisions] # Use a large enough number as the default max_workspace_size for TRT engines, From 18f1467496b4529a0a60ff3f67f8e57e0d103d1f Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Thu, 25 Jul 2019 09:00:53 -0700 Subject: [PATCH 0566/3053] [XLA] Make HLO snapshot dumping work on the LocalClient::RunAsync path. 
PiperOrigin-RevId: 259956061 --- .../compiler/xla/client/local_client.cc | 45 ++++++++++++++++++- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc index 427bdf878f0..e8a316882db 100644 --- a/tensorflow/compiler/xla/client/local_client.cc +++ b/tensorflow/compiler/xla/client/local_client.cc @@ -189,8 +189,49 @@ StatusOr LocalExecutable::RunAsync( ExecutableRunOptions run_options) { TF_ASSIGN_OR_RETURN(auto options_and_stream, RunHelper(arguments, run_options)); - return executable_->ExecuteAsyncOnStream(&options_and_stream.first, - arguments); + se::Stream* stream = run_options.stream(); + + std::shared_ptr snapshot; + if (executable_->dumping_snapshot()) { + snapshot = std::make_shared(); + snapshot->set_execution_platform(backend_->platform()->Name()); + *snapshot->mutable_hlo() = *executable_->hlo_proto(); + for (const ShapedBuffer* arg : arguments) { + auto literal = std::make_shared(arg->on_host_shape()); + backend_->transfer_manager()->TransferLiteralFromDevice( + stream, *arg, literal.get(), [snapshot, literal](Status status) { + if (!status.ok()) { + LOG(ERROR) << "TransferLiteralFromDevice for HLO snapshot inputs " + "failed: " + << status; + return; + } + *snapshot->add_arguments() = literal->ToProto(); + }); + } + } + + TF_ASSIGN_OR_RETURN( + ScopedShapedBuffer outputs, + executable_->ExecuteAsyncOnStream(&options_and_stream.first, arguments)); + + // Transfer the outputs and save the snapshot to disk. + if (snapshot) { + auto literal = std::make_shared(outputs.on_host_shape()); + backend_->transfer_manager()->TransferLiteralFromDevice( + stream, outputs, literal.get(), [snapshot, literal](Status status) { + if (status.ok()) { + *snapshot->mutable_result() = literal->ToProto(); + } else { + LOG(ERROR) + << "TransferLiteralFromDevice for HLO snapshot outputs failed: " + << status; + } + DumpHloSnapshotIfEnabled(*snapshot, GetDebugOptionsFromFlags()); + }); + } + + return std::move(outputs); } StatusOr LocalExecutable::ExecuteAndDump( From 7c03d78fdd90ceb3680690347af7b36868bc15ba Mon Sep 17 00:00:00 2001 From: Reuben Morais Date: Thu, 25 Jul 2019 16:13:59 +0000 Subject: [PATCH 0567/3053] Fix incorrect usage of execution plan in GPU delegate When checking supported ops, instead of using the node id values from the execution plan, the delegate was just using node ids 0..execution_plan.size. In a case where your graph has 20 nodes, and your execution plan covers nodes 5-20, this would instead build a subgraph out of nodes 0-15. 
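In other words, the execution plan holds the node ids themselves, so the loop index has to be mapped through execution_plan->data[i] rather than used directly as a node id. Below is a minimal, self-contained sketch of the difference; IntArray is a simplified stand-in for TfLiteIntArray rather than the real TFLite headers, and the plan values are made-up for illustration.

    #include <cstdio>
    #include <vector>

    // Simplified stand-in for TfLiteIntArray; in the delegate the plan comes
    // from context->GetExecutionPlan().
    struct IntArray {
      std::vector<int> data;
      int size() const { return static_cast<int>(data.size()); }
    };

    int main() {
      // A graph with 21 nodes whose execution plan only covers nodes 5..20.
      IntArray execution_plan;
      for (int n = 5; n <= 20; ++n) execution_plan.data.push_back(n);

      // Buggy iteration: treats the loop index as the node id (visits 0..15).
      for (int i = 0; i < execution_plan.size(); ++i) {
        int node_id = i;  // wrong: ignores the plan's contents
        (void)node_id;
      }

      // Fixed iteration: the node id is looked up in the plan (visits 5..20).
      for (int i = 0; i < execution_plan.size(); ++i) {
        int node_id = execution_plan.data[i];
        std::printf("checking node %d\n", node_id);
      }
      return 0;
    }
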
--- .../lite/delegates/gpu/common/model_builder.cc | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.cc b/tensorflow/lite/delegates/gpu/common/model_builder.cc index e074023f7c7..97edf5846a4 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder.cc +++ b/tensorflow/lite/delegates/gpu/common/model_builder.cc @@ -2245,9 +2245,10 @@ TfLiteIntArray* GetOpsToReplaceFromGraphWithDequantize(TfLiteContext* context) { std::vector inputs_from_dequant; std::vector orig_inputs; + int node_id = execution_plan->data[i]; TfLiteNode* node = nullptr; TfLiteRegistration* registration = nullptr; - auto status = GetNodeAndRegistration(context, i, &node, &registration); + auto status = GetNodeAndRegistration(context, node_id, &node, &registration); if (!status.ok()) { context->ReportError(context, status.error_message().c_str()); return nullptr; @@ -2258,9 +2259,9 @@ TfLiteIntArray* GetOpsToReplaceFromGraphWithDequantize(TfLiteContext* context) { // Record the output->input mapping for the op. node_map[node->outputs->data[0]] = node->inputs->data[0]; // For now, add the node to the list of ops to replace. - ops_to_replace.push_back(i); + ops_to_replace.push_back(node_id); // Record the dequant node id, indexed by output id. - dequant_nodes[node->outputs->data[0]] = i; + dequant_nodes[node->outputs->data[0]] = node_id; continue; } TfLiteIntArray* inputs = node->inputs; @@ -2337,9 +2338,10 @@ TfLiteIntArray* GetOpsToReplace(TfLiteContext* context) { // Dispatch to another function if graph has Dequantize nodes. for (int i = 0; i < execution_plan->size; ++i) { + int node_id = execution_plan->data[i]; TfLiteNode* node = nullptr; TfLiteRegistration* registration = nullptr; - auto status = GetNodeAndRegistration(context, i, &node, &registration); + auto status = GetNodeAndRegistration(context, node_id, &node, &registration); if (!status.ok()) { context->ReportError(context, status.error_message().c_str()); return nullptr; @@ -2356,9 +2358,10 @@ TfLiteIntArray* GetOpsToReplace(TfLiteContext* context) { subgraph->size = 0; std::set errors; for (int i = 0; i < execution_plan->size; ++i) { + int node_id = execution_plan->data[i]; TfLiteNode* node = nullptr; TfLiteRegistration* registration = nullptr; - auto status = GetNodeAndRegistration(context, i, &node, &registration); + auto status = GetNodeAndRegistration(context, node_id, &node, &registration); if (!status.ok()) { context->ReportError(context, status.error_message().c_str()); return nullptr; } status = IsSupported(context, node, registration); if (status.ok() && @@ -2369,7 +2372,7 @@ TfLiteIntArray* GetOpsToReplace(TfLiteContext* context) { // registration->builtin_code != kTfLiteBuiltinSub && IsAllFloatTensors(context, node->inputs) && IsAllFloatTensors(context, node->outputs)) { - if (errors.empty()) subgraph->data[subgraph->size++] = i; + if (errors.empty()) subgraph->data[subgraph->size++] = node_id; } else { errors.insert(GetOpNameByRegistration(registration) + ": " + status.error_message()); From 650920e6ac567e4ad17cda78df6681d72e443423 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 25 Jul 2019 09:07:55 -0700 Subject: [PATCH 0568/3053] Fix TensorArray shape inference. Shape merging should not happen if infer_shape=False. 
PiperOrigin-RevId: 259957628 --- .../kernel_tests/tensor_array_ops_test.py | 32 +++++++++ tensorflow/python/ops/tensor_array_ops.py | 68 +++++++++---------- 2 files changed, 64 insertions(+), 36 deletions(-) diff --git a/tensorflow/python/kernel_tests/tensor_array_ops_test.py b/tensorflow/python/kernel_tests/tensor_array_ops_test.py index 1cdfdf0436d..68bf5329caf 100644 --- a/tensorflow/python/kernel_tests/tensor_array_ops_test.py +++ b/tensorflow/python/kernel_tests/tensor_array_ops_test.py @@ -1747,6 +1747,38 @@ class TensorArrayTest(test.TestCase): self.assertAllEqual(v0, -3) self.assertAllEqual(v1, 100) + def testInferShapeFalseValid(self): + ta = tensor_array_ops.TensorArray( + dtypes.float32, size=3, infer_shape=False, element_shape=[None, 10, 20]) + ta = ta.write(0, array_ops.ones([50, 10, 20])) + ta = ta.write(1, array_ops.ones([50, 10, 20])) + ta = ta.write(2, array_ops.ones([1, 10, 20])) + ta = ta.concat() + + correct = np.ones([101, 10, 20]) + + self.assertAllEqual(ta, correct) + + def testInferShapeFalseInvalid(self): + ta = tensor_array_ops.TensorArray( + dtypes.float32, size=2, infer_shape=False, element_shape=[None, 10, 20]) + ta = ta.write(0, array_ops.ones([50, 10, 20])) + + with self.assertRaises(ValueError): + ta = ta.write(1, array_ops.ones([1, 20, 20])) + + def testInferShapeTrue(self): + ta = tensor_array_ops.TensorArray( + dtypes.float32, size=3, infer_shape=True, element_shape=[None, 10, 20]) + self.assertAllEqual((None, 10, 20), ta.element_shape.as_list()) + ta = ta.write(0, array_ops.ones([50, 10, 20])) + self.assertAllEqual((50, 10, 20), ta.element_shape.as_list()) + ta = ta.write(1, array_ops.ones([50, 10, 20])) + with self.assertRaises(ValueError): + ta = ta.write( + 2, array_ops.ones([1, 10, 20]) + ) # Inconsistent shapes: saw (1, 10, 20) but expected (50, 10, 20) + class TensorArrayBenchmark(test.Benchmark): diff --git a/tensorflow/python/ops/tensor_array_ops.py b/tensorflow/python/ops/tensor_array_ops.py index fab83c6073f..57fb8f5b7c8 100644 --- a/tensorflow/python/ops/tensor_array_ops.py +++ b/tensorflow/python/ops/tensor_array_ops.py @@ -135,7 +135,7 @@ class _GraphTensorArray(object): # of the first write. If `infer_shape` is true, all writes checks for # shape equality. self._element_shape = [tensor_shape.as_shape(element_shape)] - self._infer_shape = element_shape is not None or infer_shape + self._infer_shape = infer_shape with ops.name_scope(name, "TensorArray", [handle, size, flow]) as scope: if handle is not None: self._handle = handle @@ -179,7 +179,7 @@ class _GraphTensorArray(object): def element_shape(self): return self._element_shape[0] - def _merge_element_shape(self, shape): + def _check_element_shape(self, shape): """Changes the element shape of the array given a shape to merge with. Args: @@ -190,10 +190,10 @@ class _GraphTensorArray(object): element shape of the `TensorArray`. 
""" if not shape.is_compatible_with(self.element_shape): - raise ValueError( - "Inconsistent shapes: saw %s but expected %s " - "(and infer_shape=True)" % (shape, self.element_shape)) - self._element_shape[0] = self.element_shape.merge_with(shape) + raise ValueError("Inconsistent shapes: saw %s but expected %s " % + (shape, self.element_shape)) + if self._infer_shape: + self._element_shape[0] = self.element_shape.merge_with(shape) @contextlib.contextmanager def _maybe_colocate_with(self, value): @@ -266,8 +266,7 @@ class _GraphTensorArray(object): value = ops.convert_to_tensor( value, preferred_dtype=self._dtype, name="value") _check_dtypes(value, self._dtype) - if self._infer_shape: - self._merge_element_shape(value.shape) + self._check_element_shape(value.shape) with self._maybe_colocate_with(value): flow_out = gen_data_flow_ops.tensor_array_write_v3( handle=self._handle, @@ -329,8 +328,8 @@ class _GraphTensorArray(object): value = ops.convert_to_tensor( value, preferred_dtype=self._dtype, name="value") _check_dtypes(value, self._dtype) - if self._infer_shape and not context.executing_eagerly(): - self._merge_element_shape(value.shape[1:]) + if not context.executing_eagerly(): + self._check_element_shape(value.shape[1:]) with self._maybe_colocate_with(value): flow_out = gen_data_flow_ops.tensor_array_scatter_v3( handle=self._handle, @@ -348,11 +347,11 @@ class _GraphTensorArray(object): value = ops.convert_to_tensor(value, dtype=self._dtype, name="value") with self._maybe_colocate_with(value): lengths_64 = math_ops.cast(lengths, dtypes.int64) - if self._infer_shape and not context.executing_eagerly(): + if not context.executing_eagerly(): clengths = tensor_util.constant_value(lengths_64) - if value.shape.dims is not None: - if clengths is not None and clengths.max() == clengths.min(): - self._merge_element_shape( + if value.shape.dims is not None and clengths is not None: + if clengths.shape and clengths.max() == clengths.min(): + self._check_element_shape( tensor_shape.TensorShape([clengths[0]]).concatenate( value.shape[1:])) flow_out = gen_data_flow_ops.tensor_array_split_v3( @@ -447,7 +446,7 @@ class _GraphTensorArrayV2(object): # of the first write. If `infer_shape` is true, all writes checks for # shape equality. self._element_shape = [tensor_shape.as_shape(element_shape)] - self._infer_shape = element_shape is not None or infer_shape + self._infer_shape = infer_shape with ops.name_scope(name, "TensorArrayV2", [size, flow]) as scope: if flow is None: self._flow = list_ops.tensor_list_reserve( @@ -480,7 +479,7 @@ class _GraphTensorArrayV2(object): # complain. return None - def _merge_element_shape(self, shape): + def _check_element_shape(self, shape): """Changes the element shape of the array given a shape to merge with. Args: @@ -491,10 +490,10 @@ class _GraphTensorArrayV2(object): element shape of the `TensorArray`. 
""" if not shape.is_compatible_with(self.element_shape): - raise ValueError( - "Inconsistent shapes: saw %s but expected %s " - "(and infer_shape=True)" % (shape, self.element_shape)) - self._element_shape[0] = self.element_shape.merge_with(shape) + raise ValueError("Inconsistent shapes: saw %s but expected %s " % + (shape, self.element_shape)) + if self._infer_shape: + self._element_shape[0] = self.element_shape.merge_with(shape) def identity(self): """See TensorArray.""" @@ -524,8 +523,7 @@ class _GraphTensorArrayV2(object): value = ops.convert_to_tensor( value, preferred_dtype=self._dtype, name="value") _check_dtypes(value, self._dtype) - if self._infer_shape: - self._merge_element_shape(value.shape) + self._check_element_shape(value.shape) flow_out = list_ops.tensor_list_set_item( input_handle=self._flow, index=index, @@ -575,8 +573,7 @@ class _GraphTensorArrayV2(object): value = ops.convert_to_tensor( value, preferred_dtype=self._dtype, name="value") _check_dtypes(value, self._dtype) - if self._infer_shape and not context.executing_eagerly(): - self._merge_element_shape(value.shape[1:]) + self._check_element_shape(value.shape[1:]) flow_out = list_ops.tensor_list_from_tensor( tensor=value, element_shape=value.shape[1:]) return build_ta_with_new_flow(self, flow_out) @@ -590,8 +587,7 @@ class _GraphTensorArrayV2(object): value = ops.convert_to_tensor( value, preferred_dtype=self._dtype, name="value") _check_dtypes(value, self._dtype) - if self._infer_shape and not context.executing_eagerly(): - self._merge_element_shape(value.shape[1:]) + self._check_element_shape(value.shape[1:]) flow_out = list_ops.tensor_list_scatter( tensor=value, indices=indices, element_shape=self.element_shape, input_handle=self._flow) @@ -606,11 +602,11 @@ class _GraphTensorArrayV2(object): value, preferred_dtype=self._dtype, name="value") _check_dtypes(value, self._dtype) lengths_64 = math_ops.cast(lengths, dtypes.int64) - if self._infer_shape and not context.executing_eagerly(): + if not context.executing_eagerly(): clengths = tensor_util.constant_value(lengths_64) - if value.shape.dims is not None: - if clengths is not None and clengths.max() == clengths.min(): - self._merge_element_shape( + if value.shape.dims is not None and clengths is not None: + if clengths.shape and clengths.max() == clengths.min(): + self._check_element_shape( tensor_shape.TensorShape([clengths[0]]).concatenate( value.shape[1:])) flow_out = list_ops.tensor_list_split( @@ -688,7 +684,7 @@ class _EagerTensorArray(object): # we assign a dummy value to _flow in case other code assumes it to be # a Tensor self._flow = constant_op.constant(0, dtype=dtypes.int32) - self._infer_shape = element_shape is not None or infer_shape + self._infer_shape = infer_shape self._element_shape = tensor_shape.as_shape(element_shape) self._colocate_with_first_write_call = colocate_with_first_write_call @@ -804,12 +800,12 @@ class _EagerTensorArray(object): "TensorArray dtype is %s but Op is trying to write dtype %s" % (self._dtype.name, value.dtype.name)) + if not self._element_shape.is_compatible_with(value.shape): + raise ValueError("Incompatible shape for value (%s), expected (%s)" % + (value.shape, self._element_shape)) + if self._infer_shape: - if not self._element_shape.is_compatible_with(value.shape): - raise ValueError("Incompatible shape for value (%s), expected (%s)" % - (value.shape, self._element_shape)) - else: - self._element_shape = self._element_shape.merge_with(value.shape) + self._element_shape = self._element_shape.merge_with(value.shape) 
self._tensor_array[index] = value From f13bba99231cd61132a7e2078d1eed4dadae515e Mon Sep 17 00:00:00 2001 From: captain-pool Date: Thu, 25 Jul 2019 21:59:25 +0530 Subject: [PATCH 0569/3053] Minor Fix --- tensorflow/python/tools/saved_model_cli.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py index 6335383158d..634ce584919 100644 --- a/tensorflow/python/tools/saved_model_cli.py +++ b/tensorflow/python/tools/saved_model_cli.py @@ -219,14 +219,8 @@ def _print_args(arguments, argument_type="Argument", indent=0): _print_args(element, indent + 1) in_print(' ]') elif isinstance(element, dict): - in_print(' DType: %s' % type(element).__name__) - in_print(' Values: {', end='') + in_print(' {', end='') for (key, value) in element.items(): - if is_nested(element): - in_print('\n \'%s\': [' % str(key), end='') - _print_args(element, indent + 1) - in_print(' ]') - else: print('\'%s\': %s' % (str(key), _may_be_add_quotes(value)), end=', ') print('\b\b}') else: From 3c180988a21626042da280865ad553ffdf145a13 Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Thu, 25 Jul 2019 09:37:41 -0700 Subject: [PATCH 0570/3053] Surround device type in quotes This makes it more obvious when the device type is (incorrectly) an empty string as it happened in a recent bug report. PiperOrigin-RevId: 259962563 --- tensorflow/core/framework/op_kernel.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc index 020b3b205b2..6fe1f4d2090 100644 --- a/tensorflow/core/framework/op_kernel.cc +++ b/tensorflow/core/framework/op_kernel.cc @@ -1477,8 +1477,8 @@ Status CreateOpKernel(DeviceType device_type, DeviceBase* device, } if (registration == nullptr) { s.Update(errors::NotFound("No registered '", node_def.op(), - "' OpKernel for ", DeviceTypeString(device_type), - " devices compatible with node ", + "' OpKernel for '", DeviceTypeString(device_type), + "' devices compatible with node ", FormatNodeDefForError(node_def))); if (was_attr_mismatch) { errors::AppendToMessage( From 6c7a19ece64e2e944b0d1ba118073a320892bd1e Mon Sep 17 00:00:00 2001 From: captain-pool Date: Thu, 25 Jul 2019 22:23:28 +0530 Subject: [PATCH 0571/3053] Cleaned up code. --- tensorflow/python/tools/saved_model_cli.py | 54 +++++++++++----------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/tensorflow/python/tools/saved_model_cli.py b/tensorflow/python/tools/saved_model_cli.py index 634ce584919..eb5e4a1a8dc 100644 --- a/tensorflow/python/tools/saved_model_cli.py +++ b/tensorflow/python/tools/saved_model_cli.py @@ -24,6 +24,7 @@ from __future__ import division from __future__ import print_function import argparse +import collections import os import re import sys @@ -45,7 +46,6 @@ from tensorflow.python.saved_model import load from tensorflow.python.saved_model import loader from tensorflow.python.saved_model import save from tensorflow.python.tools import saved_model_utils -from tensorflow.python.util import nest # Set of ops to blacklist. 
_OP_BLACKLIST = set(['WriteFile', 'ReadFile', 'PrintV2']) @@ -173,7 +173,7 @@ def _show_defined_functions(saved_model_dir): with ops_lib.Graph().as_default(): trackable_object = load.load(saved_model_dir) - print('Defined Functions:') + print('\nDefined Functions:') functions = save._AugmentedGraphView( trackable_object).list_functions(trackable_object) for name, function in functions.items(): @@ -181,11 +181,12 @@ def _show_defined_functions(saved_model_dir): for index, concrete_functions in enumerate( function._list_all_concrete_functions_for_serialization(), 1): args, kwargs = concrete_functions.structured_input_signature - print(' Option #%d' % index) - print(' Callable with:') - _print_args(args, indent=3) + print(' Option #%d' % index) + print(' Callable with:') + _print_args(args, indent=4) if kwargs: - _print_args(kwargs, "Named Argument", indent=3) + _print_args(kwargs, "Named Argument", indent=4) + print() def _print_args(arguments, argument_type="Argument", indent=0): @@ -205,27 +206,26 @@ def _print_args(arguments, argument_type="Argument", indent=0): def in_print(s, end='\n'): print(indent_str + s, end=end) - def is_nested(args): - return nest.is_nested(args) and not isinstance(args, dict) - if nest.is_nested(arguments): - for index, element in enumerate(arguments, 1): - if indent == 3: - in_print('%s #%d' % (argument_type, index)) - if isinstance(element, tensor_spec.TensorSpec): - print((indent + 1) * ' ' + '%s: %s'%(element.name, repr(element))) - elif is_nested(element): - in_print(' DType: %s' % type(element).__name__) - in_print(' Values: [', end='') - _print_args(element, indent + 1) - in_print(' ]') - elif isinstance(element, dict): - in_print(' {', end='') - for (key, value) in element.items(): - print('\'%s\': %s' % (str(key), _may_be_add_quotes(value)), end=', ') - print('\b\b}') - else: - in_print(' DType: %s' % type(element).__name__) - in_print(' Value: %s' % str(element)) + for index, element in enumerate(arguments, 1): + if indent == 4: + in_print('%s #%d' % (argument_type, index)) + if isinstance(element, tensor_spec.TensorSpec): + print((indent + 1) * ' ' + '%s: %s'%(element.name, repr(element))) + elif isinstance(element, collections.Iterable) and not isinstance(element, dict): + in_print(' DType: %s' % type(element).__name__) + in_print(' Value: [', end='') + for value in element: + print('%s' % _may_be_add_quotes(value), end=', ') + print('\b\b]') + elif isinstance(element, dict): + in_print(' DType: %s' % type(element).__name__) + in_print(' Value: {', end='') + for (key, value) in element.items(): + print('\'%s\': %s' % (str(key), _may_be_add_quotes(value)), end=', ') + print('\b\b}') + else: + in_print(' DType: %s' % type(element).__name__) + in_print(' Value: %s' % str(element)) def _print_tensor_info(tensor_info, indent=0): """Prints details of the given tensor_info. 
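As a usage note on the cleanup above, a standalone sketch (a hypothetical helper, not the tool's actual code; the TensorSpec branch is omitted because it needs a real TensorSpec) of the branch structure _print_args now uses: dicts print their key/value pairs, other iterables print their values, and anything else falls back to a plain DType/Value line.

  import collections  # the patch targets Python 2/3; on Python >= 3.10 use collections.abc.Iterable

  def describe(element, indent=1):
      pad = '  ' * indent
      if isinstance(element, dict):
          print(pad + 'DType: dict')
          print(pad + 'Value: ' + repr(element))
      elif isinstance(element, collections.Iterable):
          print(pad + 'DType: ' + type(element).__name__)
          print(pad + 'Value: ' + repr(list(element)))
      else:
          print(pad + 'DType: ' + type(element).__name__)
          print(pad + 'Value: ' + str(element))

  for index, arg in enumerate(([1, 2, 3], {'training': False}, 0.5), 1):
      print('Argument #%d' % index)
      describe(arg)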
From a292139559c9993f474a4cc088300eaa8e2b721d Mon Sep 17 00:00:00 2001 From: Ilham Firdausi Putra Date: Fri, 26 Jul 2019 00:02:47 +0700 Subject: [PATCH 0572/3053] Override enumerate on AutoGraph --- .../python/autograph/operators/py_builtins.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/autograph/operators/py_builtins.py b/tensorflow/python/autograph/operators/py_builtins.py index ab28228c207..d1d5f3e39d1 100644 --- a/tensorflow/python/autograph/operators/py_builtins.py +++ b/tensorflow/python/autograph/operators/py_builtins.py @@ -25,6 +25,7 @@ import inspect import six +from tensorflow.data import Dataset from tensorflow.python.autograph.utils import py_func from tensorflow.python.autograph.utils import tensors from tensorflow.python.framework import constant_op @@ -242,7 +243,21 @@ def _py_range(start_or_stop, stop, step): return range(start_or_stop) -SUPPORTED_BUILTINS = (abs, float, int, len, print, range) +def enumerate_(s, start=0): + if isinstance(s, Dataset): + return _tf_dataset_enumerate(s, start) + return _py_enumerate(s, start) + + +def _tf_dataset_enumerate(s, start=0): + return s.enumerate(start) + + +def _py_enumerate(s, start=0): + return enumerate(s, start) + + +SUPPORTED_BUILTINS = (abs, float, int, len, print, range, enumerate) if six.PY2: SUPPORTED_BUILTINS += (xrange,) @@ -256,4 +271,5 @@ BUILTIN_FUINCTIONS_MAP = { 'range': range_, # TODO(mdan): This might make more sense as tf.data.range. 'xrange': range_, + 'enumerate': enumerate_, } From 288bf2f3112eb5ea25c5a0ac626f05317927e4c2 Mon Sep 17 00:00:00 2001 From: Ilham Firdausi Putra Date: Fri, 26 Jul 2019 00:03:16 +0700 Subject: [PATCH 0573/3053] Add test for enumerate overriding on AutoGraph --- tensorflow/python/autograph/operators/py_builtins_test.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tensorflow/python/autograph/operators/py_builtins_test.py b/tensorflow/python/autograph/operators/py_builtins_test.py index 1be10bf0171..bfb1f808a8e 100644 --- a/tensorflow/python/autograph/operators/py_builtins_test.py +++ b/tensorflow/python/autograph/operators/py_builtins_test.py @@ -137,6 +137,11 @@ class PyBuiltinsTest(test.TestCase): r = py_builtins.range_(5, constant_op.constant(2)) self.assertAllEqual(self.evaluate(r), []) + def test_enumerate(self): + self.assertListEqual(list(py_builtins.enumerate_([3,2,1])), [(0, 3), (1, 2), (2, 1)]) + self.assertListEqual(list(py_builtins.enumerate_([3,2,1], 5)), [(5, 3), (6, 2), (7, 1)]) + self.assertListEqual(list(py_builtins.enumerate_([-8], -3)), [(-3, -8)]) + def test_eval_in_original_context(self): def caller_1(lvl_delta): From 3fd15b97903819c17a1e1c39d93798a5f499a468 Mon Sep 17 00:00:00 2001 From: Ayush Dubey Date: Thu, 25 Jul 2019 10:00:48 -0700 Subject: [PATCH 0574/3053] Automated rollback of commit 93802f756739f8eed9c8d3d654be74a20467f2a9 PiperOrigin-RevId: 259966820 --- tensorflow/core/BUILD | 15 +++ tensorflow/core/kernels/data/BUILD | 1 + .../kernels/data/unbounded_thread_pool.cc | 97 +++------------- .../core/kernels/data/unbounded_thread_pool.h | 36 ++---- .../data/unbounded_thread_pool_test.cc | 62 +---------- .../platform/default/unbounded_work_queue.cc | 101 +++++++++++++++++ .../platform/default/unbounded_work_queue.h | 65 +++++++++++ .../core/platform/unbounded_work_queue.h | 33 ++++++ .../platform/unbounded_work_queue_test.cc | 104 ++++++++++++++++++ 9 files changed, 340 insertions(+), 174 deletions(-) create mode 100644 tensorflow/core/platform/default/unbounded_work_queue.cc create mode 
100644 tensorflow/core/platform/default/unbounded_work_queue.h create mode 100644 tensorflow/core/platform/unbounded_work_queue.h create mode 100644 tensorflow/core/platform/unbounded_work_queue_test.cc diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index fd891092e78..e22a017eaa6 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -626,6 +626,20 @@ filegroup( visibility = ["//visibility:private"], ) +tf_cc_test( + name = "platform_unbounded_work_queue_test", + srcs = ["platform/unbounded_work_queue_test.cc"], + deps = [ + ":framework", + ":lib", + ":lib_internal", + ":lib_test_internal", + ":test", + ":test_main", + "@com_google_absl//absl/memory", + ], +) + # Headers that are not exported as part of ":lib". filegroup( name = "platform_other_internal_hdrs", @@ -2465,6 +2479,7 @@ LIB_INTERNAL_PUBLIC_HEADERS = tf_additional_lib_hdrs() + [ "platform/snappy.h", "platform/tensor_coding.h", "platform/tracing.h", + "platform/unbounded_work_queue.h", "util/env_var.h", ] diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD index a5f41b6dcae..ef2f843285f 100644 --- a/tensorflow/core/kernels/data/BUILD +++ b/tensorflow/core/kernels/data/BUILD @@ -180,6 +180,7 @@ cc_library( "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", "@com_google_absl//absl/memory", ], ) diff --git a/tensorflow/core/kernels/data/unbounded_thread_pool.cc b/tensorflow/core/kernels/data/unbounded_thread_pool.cc index ac12197f1b8..9bb8f4e92e6 100644 --- a/tensorflow/core/kernels/data/unbounded_thread_pool.cc +++ b/tensorflow/core/kernels/data/unbounded_thread_pool.cc @@ -16,8 +16,9 @@ limitations under the License. #include "tensorflow/core/kernels/data/unbounded_thread_pool.h" #include "absl/memory/memory.h" +#include "tensorflow/core/lib/core/notification.h" #include "tensorflow/core/platform/env.h" -#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/unbounded_work_queue.h" namespace tensorflow { namespace data { @@ -30,7 +31,7 @@ class UnboundedThreadPool::LogicalThreadFactory : public ThreadFactory { std::unique_ptr StartThread(const string& name, std::function fn) override { - return pool_->RunOnPooledThread(std::move(fn)); + return pool_->ScheduleOnWorkQueue(std::move(fn)); } private: @@ -52,8 +53,7 @@ class UnboundedThreadPool::LogicalThreadWrapper : public Thread { // NOTE: The `Thread` destructor is expected to "join" the created thread, // but the physical thread may continue to execute after the work for this // thread is complete. We simulate this by waiting on a notification that - // the `CachedThreadFunc` will notify when the thread's work function is - // complete. + // the thread's work function will notify when it is complete. join_notification_->WaitForNotification(); } @@ -61,96 +61,25 @@ class UnboundedThreadPool::LogicalThreadWrapper : public Thread { std::shared_ptr join_notification_; }; -UnboundedThreadPool::~UnboundedThreadPool() { - { - mutex_lock l(work_queue_mu_); - // Wake up all `CachedThreadFunc` threads and cause them to terminate before - // joining them when `threads_` is cleared. - cancelled_ = true; - work_queue_cv_.notify_all(); - if (!work_queue_.empty()) { - LOG(ERROR) << "UnboundedThreadPool named \"" << thread_name_ << "\" was " - << "deleted with pending work in its queue. 
This may indicate " - << "a potential use-after-free bug."; - } - } - - { - mutex_lock l(thread_pool_mu_); - // Clear the list of pooled threads, which will eventually terminate due to - // the previous notification. - // - // NOTE: It is safe to do this while holding `pooled_threads_mu_`, because - // no subsequent calls to `this->StartThread()` should be issued after the - // destructor starts. - thread_pool_.clear(); - } -} - std::shared_ptr UnboundedThreadPool::get_thread_factory() { return std::make_shared(this); } -size_t UnboundedThreadPool::size() { - tf_shared_lock l(thread_pool_mu_); - return thread_pool_.size(); +namespace { +void WorkQueueFunc(const std::function& fn, + std::shared_ptr notification) { + fn(); + notification->Notify(); } +} // namespace -std::unique_ptr UnboundedThreadPool::RunOnPooledThread( +std::unique_ptr UnboundedThreadPool::ScheduleOnWorkQueue( std::function fn) { auto join_notification = std::make_shared(); - bool all_threads_busy; - { - // Enqueue a work item for the new thread's function, and wake up a - // cached thread to process it. - mutex_lock l(work_queue_mu_); - work_queue_.push_back({std::move(fn), join_notification}); - work_queue_cv_.notify_one(); - // NOTE: The queue may be non-empty, so we must account for queued work when - // considering how many threads are free. - all_threads_busy = work_queue_.size() > num_idle_threads_; - } - - if (all_threads_busy) { - // Spawn a new physical thread to process the given function. - // NOTE: `PooledThreadFunc` will eventually increment `num_idle_threads_` - // at the beginning of its work loop. - Thread* new_thread = env_->StartThread( - {}, thread_name_, - std::bind(&UnboundedThreadPool::PooledThreadFunc, this)); - - mutex_lock l(thread_pool_mu_); - thread_pool_.emplace_back(new_thread); - } - + unbounded_work_queue_.Schedule( + std::bind(&WorkQueueFunc, std::move(fn), join_notification)); return absl::make_unique(std::move(join_notification)); } -void UnboundedThreadPool::PooledThreadFunc() { - while (true) { - WorkItem work_item; - { - mutex_lock l(work_queue_mu_); - ++num_idle_threads_; - while (!cancelled_ && work_queue_.empty()) { - // Wait for a new work function to be submitted, or the cache to be - // destroyed. - work_queue_cv_.wait(l); - } - if (cancelled_) { - return; - } - work_item = std::move(work_queue_.front()); - work_queue_.pop_front(); - --num_idle_threads_; - } - - work_item.work_function(); - - // Notify any thread that has "joined" the cached thread for this work item. - work_item.done_notification->Notify(); - } -} - } // namespace data } // namespace tensorflow diff --git a/tensorflow/core/kernels/data/unbounded_thread_pool.h b/tensorflow/core/kernels/data/unbounded_thread_pool.h index c84d495b296..90a54b9b19f 100644 --- a/tensorflow/core/kernels/data/unbounded_thread_pool.h +++ b/tensorflow/core/kernels/data/unbounded_thread_pool.h @@ -20,55 +20,33 @@ limitations under the License. #include #include "tensorflow/core/framework/thread_factory.h" -#include "tensorflow/core/lib/core/notification.h" #include "tensorflow/core/platform/env.h" -#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/unbounded_work_queue.h" namespace tensorflow { namespace data { // An `UnboundedThreadPool` provides a mechanism for temporally multiplexing a // potentially large number of "logical" threads onto a smaller number of -// "physical" threads. 
The multiplexing is achieved by maintaining an internal -// pool of long-running "physical" threads that are used to execute the -// "logical" threads. Like a regular thread, a "logical" thread may block on -// other threads, and the size of the pool will increase to ensure that progress -// is made. This mechanism is recommended in situations where short-lived -// threads are created repeatedly, to avoid the overhead and memory -// fragmentation that can result from excessive thread creation. +// "physical" threads. The multiplexing is achieved by using an +// `UnboundedWorkQueue`. class UnboundedThreadPool { public: UnboundedThreadPool(Env* env, const string& thread_name) - : env_(env), thread_name_(thread_name) {} - ~UnboundedThreadPool(); + : unbounded_work_queue_(env, thread_name) {} + ~UnboundedThreadPool() = default; // Returns an implementation of `ThreadFactory` that can be used to create // logical threads in this pool. std::shared_ptr get_thread_factory(); - // Returns the current number of threads in this pool. - size_t size(); - private: class LogicalThreadFactory; class LogicalThreadWrapper; - struct WorkItem { - std::function work_function; - std::shared_ptr done_notification; - }; - std::unique_ptr RunOnPooledThread(std::function fn); - void PooledThreadFunc(); + std::unique_ptr ScheduleOnWorkQueue(std::function fn); - Env* const env_; // Not owned. - const string thread_name_; - mutex work_queue_mu_; - condition_variable work_queue_cv_ GUARDED_BY(work_queue_mu_); - size_t num_idle_threads_ GUARDED_BY(work_queue_mu_) = 0; - bool cancelled_ GUARDED_BY(work_queue_mu_) = false; - std::deque work_queue_ GUARDED_BY(work_queue_mu_); - mutex thread_pool_mu_; - std::vector> thread_pool_ GUARDED_BY(thread_pool_mu_); + UnboundedWorkQueue unbounded_work_queue_; }; } // namespace data diff --git a/tensorflow/core/kernels/data/unbounded_thread_pool_test.cc b/tensorflow/core/kernels/data/unbounded_thread_pool_test.cc index f996b4f931b..3604be86473 100644 --- a/tensorflow/core/kernels/data/unbounded_thread_pool_test.cc +++ b/tensorflow/core/kernels/data/unbounded_thread_pool_test.cc @@ -23,59 +23,6 @@ namespace tensorflow { namespace data { namespace { -TEST(UnboundedThreadPool, SingleThread) { - UnboundedThreadPool pool(Env::Default(), "test"); - auto thread_factory = pool.get_thread_factory(); - - // Create a thread that updates a variable, and ensure that it runs to - // completion. - std::atomic i(0); - auto thread = thread_factory->StartThread("", [&i]() { ++i; }); - thread.reset(); - - EXPECT_GE(pool.size(), 1); - EXPECT_EQ(1, i); -} - -TEST(UnboundedThreadPool, MultipleThreads) { - UnboundedThreadPool pool(Env::Default(), "test"); - auto thread_factory = pool.get_thread_factory(); - - // Create ten threads that update a variable, and ensure that they all run - // to completion. - std::vector> threads; - const int kNumThreadsToCreate = 10; - std::atomic i(0); - for (int j = 0; j < kNumThreadsToCreate; ++j) { - threads.push_back(thread_factory->StartThread("", [&i]() { ++i; })); - } - threads.clear(); - - EXPECT_GE(pool.size(), 1); - EXPECT_EQ(i, kNumThreadsToCreate); -} - -TEST(UnboundedThreadPool, MultipleThreadsSleepingRandomly) { - UnboundedThreadPool pool(Env::Default(), "test"); - auto thread_factory = pool.get_thread_factory(); - - // Create 1000 threads that sleep for a random period of time then update a - // variable, and ensure that they all run to completion. 
- std::vector> threads; - const int kNumThreadsToCreate = 1000; - std::atomic i(0); - for (int j = 0; j < kNumThreadsToCreate; ++j) { - threads.push_back(thread_factory->StartThread("", [&i]() { - Env::Default()->SleepForMicroseconds(random::New64() % 10); - ++i; - })); - } - threads.clear(); - - EXPECT_GE(pool.size(), 1); - EXPECT_EQ(i, kNumThreadsToCreate); -} - TEST(UnboundedThreadPool, ConcurrentThreadCreation) { UnboundedThreadPool pool(Env::Default(), "test"); auto thread_factory = pool.get_thread_factory(); @@ -97,7 +44,6 @@ TEST(UnboundedThreadPool, ConcurrentThreadCreation) { } threads.clear(); - EXPECT_GE(pool.size(), 1); EXPECT_EQ(i, kNumThreadsToCreate * kNumThreadsToCreate); } @@ -108,9 +54,7 @@ TEST(UnboundedThreadPool, MultipleBlockingThreads) { std::vector> threads; // Create multiple waves (with increasing sizes) of threads that all block - // before returning, and - // ensure that we create the appropriate number of threads and terminate - // correctly. + // before returning, and ensure that we terminate correctly. std::vector round_sizes = {5, 10, 15, 20}; for (const int round_size : round_sizes) { @@ -129,10 +73,6 @@ TEST(UnboundedThreadPool, MultipleBlockingThreads) { // wave is increasing, we should have at least that number of threads in the // pool. bc.Wait(); - // NOTE: There is a benign race between a new round starting and the - // physical threads from the previous round returning to the pool, so we may - // create more threads than the round_size. - EXPECT_GE(pool.size(), round_size); n.Notify(); threads.clear(); } diff --git a/tensorflow/core/platform/default/unbounded_work_queue.cc b/tensorflow/core/platform/default/unbounded_work_queue.cc new file mode 100644 index 00000000000..249d6358643 --- /dev/null +++ b/tensorflow/core/platform/default/unbounded_work_queue.cc @@ -0,0 +1,101 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/platform/unbounded_work_queue.h" + +#include "absl/memory/memory.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mutex.h" + +namespace tensorflow { + +UnboundedWorkQueue::UnboundedWorkQueue(Env* env, const string& thread_name) + : env_(env), thread_name_(thread_name) {} + +UnboundedWorkQueue::~UnboundedWorkQueue() { + { + mutex_lock l(work_queue_mu_); + // Wake up all `PooledThreadFunc` threads and cause them to terminate before + // joining them when `threads_` is cleared. + cancelled_ = true; + work_queue_cv_.notify_all(); + if (!work_queue_.empty()) { + LOG(ERROR) << "UnboundedWorkQueue named \"" << thread_name_ << "\" was " + << "deleted with pending work in its queue. This may indicate " + << "a potential use-after-free bug."; + } + } + + { + mutex_lock l(thread_pool_mu_); + // Clear the list of pooled threads, which will eventually terminate due to + // the previous notification. 
+ // + // NOTE: It is safe to do this while holding `pooled_threads_mu_`, because + // no subsequent calls to `this->StartThread()` should be issued after the + // destructor starts. + thread_pool_.clear(); + } +} + +void UnboundedWorkQueue::Schedule(WorkFunction fn) { + bool all_threads_busy; + { + // Enqueue a work item for the new thread's function, and wake up a + // cached thread to process it. + mutex_lock l(work_queue_mu_); + work_queue_.push_back(std::move(fn)); + work_queue_cv_.notify_one(); + // NOTE: The queue may be non-empty, so we must account for queued work when + // considering how many threads are free. + all_threads_busy = work_queue_.size() > num_idle_threads_; + } + + if (all_threads_busy) { + // Spawn a new physical thread to process the given function. + // NOTE: `PooledThreadFunc` will eventually increment `num_idle_threads_` + // at the beginning of its work loop. + Thread* new_thread = + env_->StartThread({}, thread_name_, [this]() { PooledThreadFunc(); }); + + mutex_lock l(thread_pool_mu_); + thread_pool_.emplace_back(new_thread); + } +} + +void UnboundedWorkQueue::PooledThreadFunc() { + while (true) { + WorkFunction fn; + { + mutex_lock l(work_queue_mu_); + ++num_idle_threads_; + while (!cancelled_ && work_queue_.empty()) { + // Wait for a new work function to be submitted, or the cache to be + // destroyed. + work_queue_cv_.wait(l); + } + if (cancelled_) { + return; + } + fn = std::move(work_queue_.front()); + work_queue_.pop_front(); + --num_idle_threads_; + } + + fn(); + } +} + +} // namespace tensorflow diff --git a/tensorflow/core/platform/default/unbounded_work_queue.h b/tensorflow/core/platform/default/unbounded_work_queue.h new file mode 100644 index 00000000000..cba83622a3a --- /dev/null +++ b/tensorflow/core/platform/default/unbounded_work_queue.h @@ -0,0 +1,65 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CORE_PLATFORM_DEFAULT_UNBOUNDED_WORK_QUEUE_H_ +#define TENSORFLOW_CORE_PLATFORM_DEFAULT_UNBOUNDED_WORK_QUEUE_H_ + +#include +#include +#include + +#include "tensorflow/core/lib/core/notification.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/mutex.h" + +namespace tensorflow { + +// An `UnboundedWorkQueue` provides a mechanism for temporally multiplexing a +// potentially large number of "logical" threads onto a smaller number of +// "physical" threads. The multiplexing is achieved by maintaining an internal +// pool of long-running "physical" threads that are used to execute the +// "logical" threads. Like a regular thread, a "logical" thread may block on +// other threads, and the size of the pool will increase to ensure that progress +// is made. This mechanism is recommended in situations where short-lived +// threads are created repeatedly, to avoid the overhead and memory +// fragmentation that can result from excessive thread creation. 
+class UnboundedWorkQueue { + public: + UnboundedWorkQueue(Env* env, const string& thread_name); + ~UnboundedWorkQueue(); + + using WorkFunction = std::function; + + // Schedule `fn` on a thread. `fn` may perform blocking work, so if all the + // existing threads are blocked or busy, this may spawn a new thread which + // will be added to the thread pool managed by this work queue. + void Schedule(WorkFunction fn); + + private: + void PooledThreadFunc(); + + Env* const env_; // Not owned. + const string thread_name_; + mutex work_queue_mu_; + condition_variable work_queue_cv_ GUARDED_BY(work_queue_mu_); + size_t num_idle_threads_ GUARDED_BY(work_queue_mu_) = 0; + bool cancelled_ GUARDED_BY(work_queue_mu_) = false; + std::deque work_queue_ GUARDED_BY(work_queue_mu_); + mutex thread_pool_mu_; + std::vector> thread_pool_ GUARDED_BY(thread_pool_mu_); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PLATFORM_DEFAULT_UNBOUNDED_WORK_QUEUE_H_ diff --git a/tensorflow/core/platform/unbounded_work_queue.h b/tensorflow/core/platform/unbounded_work_queue.h new file mode 100644 index 00000000000..242980dafa9 --- /dev/null +++ b/tensorflow/core/platform/unbounded_work_queue.h @@ -0,0 +1,33 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PLATFORM_UNBOUNDED_WORK_QUEUE_H_ +#define TENSORFLOW_CORE_PLATFORM_UNBOUNDED_WORK_QUEUE_H_ + +#include "tensorflow/core/platform/platform.h" + +// An `UnboundedWorkQueue` feeds potentially-blocking work into a thread-pool +// whose size automatically increases with demand. + +#if defined(PLATFORM_GOOGLE) +#include "tensorflow/core/platform/google/unbounded_work_queue.h" +#elif defined(PLATFORM_POSIX) || defined(PLATFORM_POSIX_ANDROID) || \ + defined(PLATFORM_GOOGLE_ANDROID) || defined(PLATFORM_WINDOWS) +#include "tensorflow/core/platform/default/unbounded_work_queue.h" +#else +#error Define the appropriate PLATFORM_ macro for this platform +#endif + +#endif // TENSORFLOW_CORE_PLATFORM_UNBOUNDED_WORK_QUEUE_H_ diff --git a/tensorflow/core/platform/unbounded_work_queue_test.cc b/tensorflow/core/platform/unbounded_work_queue_test.cc new file mode 100644 index 00000000000..03d91cd4893 --- /dev/null +++ b/tensorflow/core/platform/unbounded_work_queue_test.cc @@ -0,0 +1,104 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/platform/unbounded_work_queue.h" + +#include "absl/memory/memory.h" +#include "tensorflow/core/lib/core/blocking_counter.h" +#include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { + +class UnboundedWorkQueueTest : public ::testing::Test { + protected: + UnboundedWorkQueueTest() + : work_queue_( + absl::make_unique(Env::Default(), "test")) {} + ~UnboundedWorkQueueTest() override = default; + + void RunMultipleCopiesOfClosure(const int num_closures, + std::function fn) { + for (int i = 0; i < num_closures; ++i) { + work_queue_->Schedule([this, fn]() { + fn(); + mutex_lock l(mu_); + ++closure_count_; + cond_var_.notify_all(); + }); + } + } + + void BlockUntilClosuresDone(const int num_closures) { + mutex_lock l(mu_); + while (closure_count_ < num_closures) { + cond_var_.wait(l); + } + } + + void ResetQueue() { work_queue_.reset(); } + + int NumClosuresExecuted() { + mutex_lock l(mu_); + return closure_count_; + } + + private: + mutex mu_; + int closure_count_ GUARDED_BY(mu_) = 0; + condition_variable cond_var_; + std::unique_ptr work_queue_; +}; + +TEST_F(UnboundedWorkQueueTest, SingleClosure) { + constexpr int num_closures = 1; + RunMultipleCopiesOfClosure(num_closures, []() {}); + BlockUntilClosuresDone(num_closures); +} + +TEST_F(UnboundedWorkQueueTest, MultipleClosures) { + constexpr int num_closures = 10; + RunMultipleCopiesOfClosure(num_closures, []() {}); + BlockUntilClosuresDone(num_closures); +} + +TEST_F(UnboundedWorkQueueTest, MultipleClosuresSleepingRandomly) { + constexpr int num_closures = 1000; + RunMultipleCopiesOfClosure(num_closures, []() { + Env::Default()->SleepForMicroseconds(random::New64() % 10); + }); + BlockUntilClosuresDone(num_closures); +} + +TEST_F(UnboundedWorkQueueTest, NestedClosures) { + constexpr int num_closures = 10; + // Run `num_closures` closures, each of which runs `num_closures` closures. + RunMultipleCopiesOfClosure(num_closures, [this]() { + RunMultipleCopiesOfClosure(num_closures, []() {}); + }); + BlockUntilClosuresDone(num_closures * num_closures + num_closures); +} + +TEST_F(UnboundedWorkQueueTest, RacyDestructor) { + constexpr int num_closures = 100; + // Run `num_closures` closures, then delete `work_queue_`. 
+ RunMultipleCopiesOfClosure(num_closures, []() {}); + ResetQueue(); + EXPECT_LE(NumClosuresExecuted(), num_closures); +} + +} // namespace +} // namespace tensorflow From 3630a33292a9466b20373f80e04c57616520501d Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Thu, 25 Jul 2019 10:10:48 -0700 Subject: [PATCH 0575/3053] Fix compilation issues: pack.cc has been renamed to pack_arm.cc PiperOrigin-RevId: 259969091 --- tensorflow/lite/tools/make/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/tools/make/Makefile b/tensorflow/lite/tools/make/Makefile index c37b7cf67a5..7e34802ef54 100644 --- a/tensorflow/lite/tools/make/Makefile +++ b/tensorflow/lite/tools/make/Makefile @@ -106,7 +106,7 @@ tensorflow/lite/experimental/ruy/context.cc \ tensorflow/lite/experimental/ruy/detect_dotprod.cc \ tensorflow/lite/experimental/ruy/kernel_arm32.cc \ tensorflow/lite/experimental/ruy/kernel_arm64.cc \ -tensorflow/lite/experimental/ruy/pack.cc \ +tensorflow/lite/experimental/ruy/pack_arm.cc \ tensorflow/lite/experimental/ruy/pmu.cc \ tensorflow/lite/experimental/ruy/thread_pool.cc \ tensorflow/lite/experimental/ruy/trace.cc \ From 665d91a1df291aad78af32233ef8b14382cbef00 Mon Sep 17 00:00:00 2001 From: Yilei Yang Date: Thu, 25 Jul 2019 10:23:54 -0700 Subject: [PATCH 0576/3053] Explicitly mark Python binaries/tests with python_version = "PY2". PiperOrigin-RevId: 259971580 --- tensorflow/python/eager/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD index b58bf1875fd..40f1a999e4b 100644 --- a/tensorflow/python/eager/BUILD +++ b/tensorflow/python/eager/BUILD @@ -192,6 +192,7 @@ py_library( py_test( name = "profiler_client_test", srcs = ["profiler_client_test.py"], + python_version = "PY2", srcs_version = "PY2AND3", tags = ["no_pip"], visibility = ["//tensorflow:internal"], From 9b3f1d992a96c626cb39e5199ebcce3c64e89e1b Mon Sep 17 00:00:00 2001 From: Penporn Koanantakool Date: Thu, 25 Jul 2019 10:40:18 -0700 Subject: [PATCH 0577/3053] Incorporate a function name change suggested in PR#30962. #30962 PiperOrigin-RevId: 259975474 --- .../python/debug/cli/analyzer_cli_test.py | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/tensorflow/python/debug/cli/analyzer_cli_test.py b/tensorflow/python/debug/cli/analyzer_cli_test.py index 477eb2b04ba..6bb4a28b374 100644 --- a/tensorflow/python/debug/cli/analyzer_cli_test.py +++ b/tensorflow/python/debug/cli/analyzer_cli_test.py @@ -50,7 +50,7 @@ from tensorflow.python.util import tf_inspect # Helper function to accommodate MKL-enabled TensorFlow: # MatMul op is supported by MKL and its name is prefixed with "_Mkl" during the # MKL graph rewrite pass. -def matmul_op_name(): +def _matmul_op_name(): return "_MklMatMul" if test_util.IsMklEnabled() else "MatMul" @@ -678,7 +678,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): "simple_mul_add/add:0" ], [ "VariableV2", "VariableV2", "Identity", "Identity", - matmul_op_name(), "Add" + _matmul_op_name(), "Add" ]) # Check the main menu. 
@@ -695,7 +695,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): "simple_mul_add/matmul:0", "simple_mul_add/add:0" ], [ "VariableV2", "VariableV2", "Identity", "Identity", - matmul_op_name(), "Add" + _matmul_op_name(), "Add" ], sort_by="timestamp", reverse=True) @@ -711,7 +711,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): "simple_mul_add/matmul:0", "simple_mul_add/add:0" ], [ "VariableV2", "VariableV2", "Identity", "Identity", - matmul_op_name(), "Add" + _matmul_op_name(), "Add" ], sort_by="dump_size") check_main_menu(self, out, list_tensors_enabled=False) @@ -726,7 +726,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): "simple_mul_add/matmul:0", "simple_mul_add/add:0" ], [ "VariableV2", "VariableV2", "Identity", "Identity", - matmul_op_name(), "Add" + _matmul_op_name(), "Add" ], sort_by="dump_size", reverse=True) @@ -748,7 +748,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): "simple_mul_add/matmul:0", "simple_mul_add/add:0" ], [ "VariableV2", "VariableV2", "Identity", "Identity", - matmul_op_name(), "Add" + _matmul_op_name(), "Add" ], sort_by="op_type", reverse=False) @@ -765,7 +765,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): "simple_mul_add/matmul:0", "simple_mul_add/add:0" ], [ "VariableV2", "VariableV2", "Identity", "Identity", - matmul_op_name(), "Add" + _matmul_op_name(), "Add" ], sort_by="op_type", reverse=True) @@ -782,7 +782,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): "simple_mul_add/matmul:0", "simple_mul_add/add:0" ], [ "VariableV2", "VariableV2", "Identity", "Identity", - matmul_op_name(), "Add" + _matmul_op_name(), "Add" ], sort_by="tensor_name", reverse=False) @@ -799,7 +799,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): "simple_mul_add/matmul:0", "simple_mul_add/add:0" ], [ "VariableV2", "VariableV2", "Identity", "Identity", - matmul_op_name(), "Add" + _matmul_op_name(), "Add" ], sort_by="tensor_name", reverse=True) @@ -828,12 +828,12 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): op_type_regex="Identity") out = self._registry.dispatch_command( - "list_tensors", ["-t", "(Add|" + matmul_op_name() + ")"]) + "list_tensors", ["-t", "(Add|" + _matmul_op_name() + ")"]) assert_listed_tensors( self, out, ["simple_mul_add/add:0", "simple_mul_add/matmul:0"], - ["Add", matmul_op_name()], - op_type_regex=("(Add|" + matmul_op_name() + ")")) + ["Add", _matmul_op_name()], + op_type_regex=("(Add|" + _matmul_op_name() + ")")) check_main_menu(self, out, list_tensors_enabled=False) def testListTensorFilterByNodeNameRegexAndOpTypeRegex(self): @@ -869,7 +869,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): assert_listed_tensors( self, out, ["simple_mul_add/matmul:0", "simple_mul_add/add:0"], - [matmul_op_name(), "Add"], + [_matmul_op_name(), "Add"], tensor_filter_name="is_2x1_vector") check_main_menu(self, out, list_tensors_enabled=False) @@ -910,7 +910,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): recipients = [("Add", "simple_mul_add/add"), ("Add", "simple_mul_add/add")] - assert_node_attribute_lines(self, out, node_name, matmul_op_name(), + assert_node_attribute_lines(self, out, node_name, _matmul_op_name(), self._main_device, [("Identity", "simple_mul_add/u/read"), ("Identity", "simple_mul_add/v/read")], [], @@ -942,7 +942,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): self, out, node_name, - matmul_op_name(), + _matmul_op_name(), 
self._main_device, [("Identity", "simple_mul_add/u/read"), ("Identity", "simple_mul_add/v/read")], [], [("Add", "simple_mul_add/add"), ("Add", "simple_mul_add/add")], [], @@ -963,7 +963,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): self, out, node_name, - matmul_op_name(), + _matmul_op_name(), self._main_device, [("Identity", "simple_mul_add/u/read"), ("Identity", "simple_mul_add/v/read")], [], [("Add", "simple_mul_add/add"), ("Add", "simple_mul_add/add")], [], @@ -989,7 +989,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): self, out, node_name, - matmul_op_name(), + _matmul_op_name(), self._main_device, [("Identity", "simple_mul_add/u/read"), ("Identity", "simple_mul_add/v/read")], [], [("Add", "simple_mul_add/add"), ("Add", "simple_mul_add/add")], [], @@ -1013,7 +1013,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): self, out, node_name, - matmul_op_name(), + _matmul_op_name(), self._main_device, [("Identity", "simple_mul_add/u/read"), ("Identity", "simple_mul_add/v/read")], [], [("Add", "simple_mul_add/add"), ("Add", "simple_mul_add/add")], [], @@ -1035,7 +1035,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase): assert_node_attribute_lines(self, out, node_name, "Identity", self._main_device, [("VariableV2", "simple_mul_add/u")], [], - [(matmul_op_name(), "simple_mul_add/matmul")], + [(_matmul_op_name(), "simple_mul_add/matmul")], []) check_main_menu( self, From ea3185dab5f5fb36ab722ca61b95776408b8e2a9 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Thu, 25 Jul 2019 10:56:38 -0700 Subject: [PATCH 0578/3053] Update tensorflow/python/kernel_tests/conv_ops_3d_test.py Co-Authored-By: Penporn Koanantakool <38085909+penpornk@users.noreply.github.com> --- tensorflow/python/kernel_tests/conv_ops_3d_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/kernel_tests/conv_ops_3d_test.py b/tensorflow/python/kernel_tests/conv_ops_3d_test.py index 60a8ad466b1..021bb89ddaf 100644 --- a/tensorflow/python/kernel_tests/conv_ops_3d_test.py +++ b/tensorflow/python/kernel_tests/conv_ops_3d_test.py @@ -335,7 +335,7 @@ class Conv3DTest(test.TestCase): conv = nn_ops.conv3d( input_tensor, filter_tensor, - [1, 1, 1, 1, 1], + strides=[1, 1, 1, 1, 1], dilations=[1, 1, 1, 1, 1], padding='SAME', data_format='NDHWC', From 53e16f1488c9c905258b108b57b620296f3daa58 Mon Sep 17 00:00:00 2001 From: Derek Murray Date: Thu, 25 Jul 2019 10:45:56 -0700 Subject: [PATCH 0579/3053] Avoid re-entering the default graph when building a function. PiperOrigin-RevId: 259976938 --- tensorflow/python/framework/ops.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index 61688e5c8bc..aa38da22122 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -5875,8 +5875,9 @@ def _get_graph_from_inputs(op_input_list, graph=None): The appropriate graph to use for the given inputs. """ - if get_default_graph().building_function: - return get_default_graph() + current_default_graph = get_default_graph() + if current_default_graph.building_function: + return current_default_graph op_input_list = tuple(op_input_list) # Handle generators correctly if graph and not isinstance(graph, Graph): @@ -5909,7 +5910,7 @@ def _get_graph_from_inputs(op_input_list, graph=None): raise ValueError("%s is not from the passed-in graph." % graph_element) # 2. 
If all else fails, we use the default graph, which is always there. - return graph or get_default_graph() + return graph or current_default_graph @tf_export(v1=["GraphKeys"]) @@ -6254,15 +6255,21 @@ class name_scope(object): # pylint: disable=invalid-name raise ValueError( "At least one of name (%s) and default_name (%s) must be provided." % (self._name, self._default_name)) - if self._values is None: - self._values = [] - if self._values: - g = _get_graph_from_inputs(self._values) - self._g_manager = g.as_default() - self._g_manager.__enter__() + + g = get_default_graph() + if self._values and not g.building_function: + # Specialize based on the knowledge that `_get_graph_from_inputs()` + # ignores `inputs` when building a function. + g_from_inputs = _get_graph_from_inputs(self._values) + if g_from_inputs is not g: + g = g_from_inputs + self._g_manager = g.as_default() + self._g_manager.__enter__() + else: + self._g_manager = None else: - g = get_default_graph() self._g_manager = None + try: self._name_scope = g.name_scope(self._name) return self._name_scope.__enter__() From b1ca11610329cb7759a3a60856675da01cc1e6b4 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Thu, 25 Jul 2019 10:58:42 -0700 Subject: [PATCH 0580/3053] Introduce a SidePair concept allowing us to rewrite much internal ruy code taking advantage of LHS<->RHS code symmetry to remove some redundancy. The key motivation was that I want to experiment with some nontrivial changes to how TrMulTask handles the packing of blocks, and I didn't want to have to maintain two copies of this nontrivial code. With this change, this code is now in a EnsurePacked method that's all I'll have to edit. PiperOrigin-RevId: 259980220 --- tensorflow/lite/experimental/ruy/BUILD | 29 ++- tensorflow/lite/experimental/ruy/block_map.cc | 25 +- tensorflow/lite/experimental/ruy/block_map.h | 33 ++- tensorflow/lite/experimental/ruy/dispatch.h | 31 +-- tensorflow/lite/experimental/ruy/kernel.h | 19 +- tensorflow/lite/experimental/ruy/pack.h | 5 - tensorflow/lite/experimental/ruy/prepack.h | 54 ++--- .../lite/experimental/ruy/ruy_advanced.h | 8 +- tensorflow/lite/experimental/ruy/side_pair.h | 54 +++++ tensorflow/lite/experimental/ruy/trace.cc | 23 +- tensorflow/lite/experimental/ruy/trace.h | 7 +- tensorflow/lite/experimental/ruy/trmul.cc | 214 +++++++++--------- tensorflow/lite/experimental/ruy/trmul.h | 39 +--- .../lite/experimental/ruy/trmul_params.h | 60 +++++ 14 files changed, 340 insertions(+), 261 deletions(-) create mode 100644 tensorflow/lite/experimental/ruy/side_pair.h create mode 100644 tensorflow/lite/experimental/ruy/trmul_params.h diff --git a/tensorflow/lite/experimental/ruy/BUILD b/tensorflow/lite/experimental/ruy/BUILD index 60ad08bbda7..2548f0f6f73 100644 --- a/tensorflow/lite/experimental/ruy/BUILD +++ b/tensorflow/lite/experimental/ruy/BUILD @@ -110,6 +110,12 @@ cc_test( ], ) +cc_library( + name = "side_pair", + hdrs = ["side_pair.h"], + deps = [":check_macros"], +) + cc_library( name = "block_map", srcs = [ @@ -121,6 +127,7 @@ cc_library( deps = [ ":check_macros", ":opt_set", + ":side_pair", ":size_util", "@gemmlowp//:profiler", ], @@ -189,6 +196,7 @@ cc_library( ":block_map", ":check_macros", ":common", + ":side_pair", ":time", ], ) @@ -267,6 +275,7 @@ cc_library( ":opt_set", ":path", ":platform", + ":side_pair", ":size_util", ":spec", ":tune", @@ -295,6 +304,17 @@ cc_library( ], ) +cc_library( + name = "trmul_params", + hdrs = ["trmul_params.h"], + deps = [ + ":context", + ":internal_matrix", + ":side_pair", + ":tune", + ], +) + 
cc_library( name = "trmul", srcs = ["trmul.cc"], @@ -302,14 +322,16 @@ cc_library( deps = [ ":allocator", ":block_map", + ":check_macros", ":common", ":context", ":internal_matrix", - ":kernel", ":opt_set", - ":pack", + ":side_pair", + ":spec", ":thread_pool", ":trace", + ":trmul_params", ":tune", "@gemmlowp//:profiler", ], @@ -331,8 +353,11 @@ cc_library( ":check_macros", ":common", ":context", + ":kernel", ":matrix", + ":pack", ":path", + ":side_pair", ":size_util", ":spec", ":trmul", diff --git a/tensorflow/lite/experimental/ruy/block_map.cc b/tensorflow/lite/experimental/ruy/block_map.cc index 74055801d39..501466396bf 100644 --- a/tensorflow/lite/experimental/ruy/block_map.cc +++ b/tensorflow/lite/experimental/ruy/block_map.cc @@ -23,7 +23,7 @@ limitations under the License. namespace ruy { void GetBlockByIndex(const BlockMap& block_map, std::uint32_t index, - std::uint16_t* block_r, std::uint16_t* block_c) { + SidePair* block) { gemmlowp::ScopedProfilingLabel label("GetBlockByIndex"); std::uint16_t rectr = index & ((1 << block_map.rows_rectangularness_log2) - 1); @@ -60,8 +60,8 @@ void GetBlockByIndex(const BlockMap& block_map, std::uint32_t index, bc = (bc << block_map.cols_rectangularness_log2) + rectc; // Store - *block_r = br; - *block_c = bc; + (*block)[Side::kLhs] = br; + (*block)[Side::kRhs] = bc; } namespace { @@ -208,9 +208,12 @@ void MakeBlockMap(int rows, int cols, int depth, int kernel_rows, block_map->missc = missc; } -void GetBlockMatrixCoords(const BlockMap& block_map, std::uint16_t block_r, - std::uint16_t block_c, int* start_r, int* start_c, - int* end_r, int* end_c) { +void GetBlockMatrixCoords(const BlockMap& block_map, + const SidePair& block, + SidePair* start, SidePair* end) { + std::uint16_t block_r = block[Side::kLhs]; + std::uint16_t block_c = block[Side::kRhs]; + gemmlowp::ScopedProfilingLabel label("GetBlockMatrixCoords"); int sr = block_r * block_map.smallr + std::min(block_r, block_map.missr) * block_map.kernel_rows; @@ -230,17 +233,17 @@ void GetBlockMatrixCoords(const BlockMap& block_map, std::uint16_t block_r, sc = std::max(0, ec - round_up_pot(ec - sc, block_map.kernel_cols)); sr = std::max(0, er - round_up_pot(er - sr, block_map.kernel_rows)); - *start_c = sc; - *end_c = ec; - *start_r = sr; - *end_r = er; - RUY_DCHECK_LE(ec, block_map.cols); RUY_DCHECK_LE(er, block_map.rows); RUY_DCHECK_LT(sc, ec); RUY_DCHECK_LT(sr, er); RUY_DCHECK_GE(sc, 0); RUY_DCHECK_GE(sr, 0); + + (*start)[Side::kLhs] = sr; + (*end)[Side::kLhs] = er; + (*start)[Side::kRhs] = sc; + (*end)[Side::kRhs] = ec; } } // namespace ruy diff --git a/tensorflow/lite/experimental/ruy/block_map.h b/tensorflow/lite/experimental/ruy/block_map.h index b0567ea481f..1708421af63 100644 --- a/tensorflow/lite/experimental/ruy/block_map.h +++ b/tensorflow/lite/experimental/ruy/block_map.h @@ -18,6 +18,7 @@ limitations under the License. #include +#include "tensorflow/lite/experimental/ruy/side_pair.h" namespace ruy { enum class BlockMapTraversalOrder { @@ -114,28 +115,24 @@ void MakeBlockMap(int rows, int cols, int depth, int kernel_rows, int kernel_cols, int lhs_scalar_size, int rhs_scalar_size, int cache_friendly_traversal_threshold, BlockMap* block_map); -// Maps an integer index to a (block_r, block_c) block position in the grid. +// Maps an integer index to a block position in the grid. 
void GetBlockByIndex(const BlockMap& block_map, std::uint32_t index, - std::uint16_t* block_r, std::uint16_t* block_c); + SidePair* block); -// Given a (block_r, block_c) block position in the grid, returns its actual +// Given a block position in the grid, returns its actual // position in the matrix that the BlockMap refers to in terms of -// actual row/column indices: starting at row start_r and column start_c, -// ending at row (end_r - 1) and column (end_c - 1). -void GetBlockMatrixCoords(const BlockMap& block_map, std::uint16_t block_r, - std::uint16_t block_c, int* start_r, int* start_c, - int* end_r, int* end_c); +// actual row/column indices. +void GetBlockMatrixCoords(const BlockMap& block_map, + const SidePair& block, + SidePair* start, SidePair* end); -// Returns the number of grid subdivisions along the rows dimension. -inline std::uint16_t NumBlocksOfRows(const BlockMap& block_map) { - return 1 << (block_map.num_blocks_base_log2 + - block_map.rows_rectangularness_log2); -} - -// Returns the number of grid subdivisions along the columns dimension. -inline std::uint16_t NumBlocksOfCols(const BlockMap& block_map) { - return 1 << (block_map.num_blocks_base_log2 + - block_map.cols_rectangularness_log2); +// Returns the number of grid subdivisions along the rows dimension (if +// side == kLhs) or columns dimension (if side == kRhs). +inline std::uint16_t NumBlocksPerSide(Side side, const BlockMap& block_map) { + int rectangularness_log2 = side == Side::kLhs + ? block_map.rows_rectangularness_log2 + : block_map.cols_rectangularness_log2; + return 1 << (block_map.num_blocks_base_log2 + rectangularness_log2); } // Returns the overall number of blocks in diff --git a/tensorflow/lite/experimental/ruy/dispatch.h b/tensorflow/lite/experimental/ruy/dispatch.h index 9044be70bb7..aab8d2dbbfe 100644 --- a/tensorflow/lite/experimental/ruy/dispatch.h +++ b/tensorflow/lite/experimental/ruy/dispatch.h @@ -38,7 +38,9 @@ limitations under the License. #include "profiling/instrumentation.h" #include "tensorflow/lite/experimental/ruy/common.h" #include "tensorflow/lite/experimental/ruy/context.h" +#include "tensorflow/lite/experimental/ruy/kernel.h" #include "tensorflow/lite/experimental/ruy/matrix.h" +#include "tensorflow/lite/experimental/ruy/pack.h" #include "tensorflow/lite/experimental/ruy/spec.h" #include "tensorflow/lite/experimental/ruy/trmul.h" @@ -108,10 +110,10 @@ void EnforceDstSpecSupport(const Spec& spec, DstScalar dst_zero_point) { RUY_DCHECK(spec.multiplier_exponent_perchannel == nullptr); } -inline bool IsColMajorTrMul(const DMatrix& lhs, const DMatrix& rhs, - const DMatrix& dst) { - return IsColMajor(lhs.layout) && IsColMajor(rhs.layout) && - IsColMajor(dst.layout); +inline bool IsColMajorTrMul(const TrMulParams& params) { + return IsColMajor(params.src[Side::kLhs].layout) && + IsColMajor(params.src[Side::kRhs].layout) && + IsColMajor(params.dst.layout); } inline void CreatePackedLayout(const Layout& src, const Type& scalar, @@ -131,8 +133,8 @@ inline void CreatePackedLayout(const Layout& src, const Type& scalar, } template -void CreatePackedMatrix(const DMatrix& src, const KernelLayout& kernel_layout, - PMatrix* packed) { +void CreatePackedMatrix(Side side, const KernelLayout& kernel_layout, + TrMulParams* params) { // Ruy always uses 32-bit signed accumulators for quantized // matrix multiplication, so we would like to always use std::int32_t // unconditionally for SumsType. 
@@ -142,6 +144,8 @@ void CreatePackedMatrix(const DMatrix& src, const KernelLayout& kernel_layout, typename std::conditional::value, Scalar, std::int32_t>::type; + const DMatrix& src = params->src[side]; + PMatrix* packed = &params->packed[side]; packed->data_type = Type::Create(); packed->sums_type = Type::Create(); CreatePackedLayout(src.layout, packed->data_type, kernel_layout, @@ -160,7 +164,7 @@ void PopulateTrMulParams(TrMulParams* params) { if (ThePath != Path::kStandardCpp) { // The optimized code paths currently only handle the case of all matrices // being column major. - if (!IsColMajorTrMul(params->lhs, params->rhs, params->dst)) { + if (!IsColMajorTrMul(*params)) { fallback_to_standard_cpp = true; } } @@ -179,13 +183,12 @@ void PopulateTrMulParams(TrMulParams* params) { using RhsKernelLayout = typename Kernel::RhsLayout; CreatePackedMatrix( - params->lhs, ToKernelLayout(), &params->packed_lhs); + Side::kLhs, ToKernelLayout(), params); CreatePackedMatrix( - params->rhs, ToKernelLayout(), &params->packed_rhs); - - params->lhs_run_pack = + Side::kRhs, ToKernelLayout(), params); + params->run_pack[Side::kLhs] = &RunPack; - params->rhs_run_pack = + params->run_pack[Side::kRhs] = &RunPack; params->run_kernel = &RunKernel; @@ -304,8 +307,8 @@ void CreateTrMulParams(const Matrix& lhs, Context* context, Matrix* dst, Path the_path, TrMulParams* params) { // Fill in the fields we already know. - params->lhs = ToDMatrix(lhs); - params->rhs = ToDMatrix(rhs); + params->src[Side::kLhs] = ToDMatrix(lhs); + params->src[Side::kRhs] = ToDMatrix(rhs); params->dst = ToDMatrix(*dst); params->spec = ToVoidPtr(&spec); diff --git a/tensorflow/lite/experimental/ruy/kernel.h b/tensorflow/lite/experimental/ruy/kernel.h index 0c7a2e356f5..8bfacf26d3b 100644 --- a/tensorflow/lite/experimental/ruy/kernel.h +++ b/tensorflow/lite/experimental/ruy/kernel.h @@ -26,6 +26,7 @@ limitations under the License. #include "tensorflow/lite/experimental/ruy/opt_set.h" #include "tensorflow/lite/experimental/ruy/path.h" #include "tensorflow/lite/experimental/ruy/platform.h" +#include "tensorflow/lite/experimental/ruy/side_pair.h" #include "tensorflow/lite/experimental/ruy/size_util.h" #include "tensorflow/lite/experimental/ruy/spec.h" #include "tensorflow/lite/experimental/ruy/tune.h" @@ -76,21 +77,17 @@ void RunKernelTyped(Tuning tuning, const PackedMatrix& lhs, // Main entry point for kernels. template -void RunKernel(Tuning tuning, const PMatrix& lhs, const PMatrix& rhs, - void* spec, int start_row, int start_col, int end_row, - int end_col, DMatrix* dst) { +void RunKernel(Tuning tuning, const SidePair& src, void* spec, + const SidePair& start, const SidePair& end, + DMatrix* dst) { Matrix mdst = ToMatrix(*dst); RunKernelTyped( - tuning, ToPackedMatrix(lhs), ToPackedMatrix(rhs), - *static_cast(spec), start_row, start_col, end_row, end_col, - &mdst); + tuning, ToPackedMatrix(src[Side::kLhs]), + ToPackedMatrix(src[Side::kRhs]), + *static_cast(spec), start[Side::kLhs], start[Side::kRhs], + end[Side::kLhs], end[Side::kRhs], &mdst); } -// The signature of RunKernel is the same, regardless of template parameters. -using RunKernelFn = - decltype(RunKernel>); - // Copied from TF Lite code.
inline std::int32_t MultiplyByQuantizedMultiplier( std::int32_t x, std::int32_t quantized_multiplier, int shift) { diff --git a/tensorflow/lite/experimental/ruy/pack.h b/tensorflow/lite/experimental/ruy/pack.h index 8a4034cd2f2..9b7d6265ab7 100644 --- a/tensorflow/lite/experimental/ruy/pack.h +++ b/tensorflow/lite/experimental/ruy/pack.h @@ -492,11 +492,6 @@ void RunPack(Tuning tuning, const DMatrix& src_matrix, PMatrix* packed_matrix, tuning, src, &packed, start_col, end_col); } -// The signature of RunPack is the same, regardless of its template parameters. -using RunPackFn = decltype( - RunPack, - std::int8_t, std::int8_t>); - } // namespace ruy #endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_PACK_H_ diff --git a/tensorflow/lite/experimental/ruy/prepack.h b/tensorflow/lite/experimental/ruy/prepack.h index 9019efa5de6..d7b7888dd14 100644 --- a/tensorflow/lite/experimental/ruy/prepack.h +++ b/tensorflow/lite/experimental/ruy/prepack.h @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/lite/experimental/ruy/dispatch.h" #include "tensorflow/lite/experimental/ruy/matrix.h" #include "tensorflow/lite/experimental/ruy/path.h" +#include "tensorflow/lite/experimental/ruy/side_pair.h" #include "tensorflow/lite/experimental/ruy/spec.h" #include "tensorflow/lite/experimental/ruy/tune.h" @@ -34,8 +35,7 @@ template & lhs, const Matrix& rhs, const Spec& spec, Context* context, Matrix* dst, - PrepackedMatrix* prepacked_lhs, - PrepackedMatrix* prepacked_rhs, + SidePair prepacked, std::function alloc_fn) { gemmlowp::ScopedProfilingLabel label("PrePackForMul"); Path the_path = context->GetPathToTake(); @@ -47,24 +47,21 @@ void PrePackForMulInternal(const Matrix& lhs, CreateTrMulParams(transposed_lhs, rhs, spec, context, dst, the_path, &params); + const SidePair origin{0, 0}; + const SidePair rounded_dims{params.packed[Side::kLhs].layout.cols, + params.packed[Side::kRhs].layout.cols}; + Tuning tuning = context->GetMainThreadTuning(); - if (prepacked_lhs) { - prepacked_lhs->data_size = DataSize(params.packed_lhs); - prepacked_lhs->sums_size = SumsSize(params.packed_lhs); - prepacked_lhs->data = alloc_fn(prepacked_lhs->data_size); - prepacked_lhs->sums = alloc_fn(prepacked_lhs->sums_size); - params.packed_lhs.data = prepacked_lhs->data; - params.packed_lhs.sums = prepacked_lhs->sums; - params.LhsRunPack(tuning, 0, params.packed_lhs.layout.cols); - } - if (prepacked_rhs) { - prepacked_rhs->data_size = DataSize(params.packed_rhs); - prepacked_rhs->sums_size = SumsSize(params.packed_rhs); - prepacked_rhs->data = alloc_fn(prepacked_rhs->data_size); - prepacked_rhs->sums = alloc_fn(prepacked_rhs->sums_size); - params.packed_rhs.data = prepacked_rhs->data; - params.packed_rhs.sums = prepacked_rhs->sums; - params.RhsRunPack(tuning, 0, params.packed_rhs.layout.cols); + for (Side side : {Side::kLhs, Side::kRhs}) { + if (prepacked[side]) { + prepacked[side]->data_size = DataSize(params.packed[side]); + prepacked[side]->sums_size = SumsSize(params.packed[side]); + prepacked[side]->data = alloc_fn(prepacked[side]->data_size); + prepacked[side]->sums = alloc_fn(prepacked[side]->sums_size); + params.packed[side].data = prepacked[side]->data; + params.packed[side].sums = prepacked[side]->sums; + params.RunPack(side, tuning, origin, rounded_dims); + } } } @@ -73,8 +70,7 @@ template & lhs, const Matrix& rhs, const Spec& spec, Context* context, Matrix* dst, - PrepackedMatrix* prepacked_lhs, - PrepackedMatrix* prepacked_rhs) { + SidePair prepacked) { gemmlowp::ScopedProfilingLabel label("MulWithPrepacked");
EnforceLayoutSupport(lhs.layout, rhs.layout, dst->layout); @@ -90,16 +86,14 @@ void MulWithPrepackedInternal(const Matrix& lhs, CreateTrMulParams(transposed_lhs, rhs, spec, context, dst, the_path, &params); - if (prepacked_lhs) { - params.packed_lhs.data = prepacked_lhs->data; - params.packed_lhs.sums = prepacked_lhs->sums; - params.lhs_is_prepacked = true; - } - if (prepacked_rhs) { - params.packed_rhs.data = prepacked_rhs->data; - params.packed_rhs.sums = prepacked_rhs->sums; - params.rhs_is_prepacked = true; + for (Side side : {Side::kLhs, Side::kRhs}) { + if (prepacked[side]) { + params.packed[side].data = prepacked[side]->data; + params.packed[side].sums = prepacked[side]->sums; + params.is_prepacked[side] = true; + } } + TrMul(&params, context); } diff --git a/tensorflow/lite/experimental/ruy/ruy_advanced.h b/tensorflow/lite/experimental/ruy/ruy_advanced.h index 36382e7d8e5..66b09ad9c4b 100644 --- a/tensorflow/lite/experimental/ruy/ruy_advanced.h +++ b/tensorflow/lite/experimental/ruy/ruy_advanced.h @@ -40,8 +40,9 @@ void PrePackForMul(const Matrix& lhs, const Matrix& rhs, PrepackedMatrix* prepacked_lhs, PrepackedMatrix* prepacked_rhs, std::function alloc_fn) { - PrePackForMulInternal(lhs, rhs, spec, context, dst, - prepacked_lhs, prepacked_rhs, alloc_fn); + SidePair prepacked(prepacked_lhs, prepacked_rhs); + PrePackForMulInternal(lhs, rhs, spec, context, dst, prepacked, + alloc_fn); } template & lhs, Context* context, Matrix* dst, PrepackedMatrix* prepacked_lhs, PrepackedMatrix* prepacked_rhs) { + SidePair prepacked(prepacked_lhs, prepacked_rhs); MulWithPrepackedInternal(lhs, rhs, spec, context, dst, - prepacked_lhs, prepacked_rhs); + prepacked); } } // namespace ruy diff --git a/tensorflow/lite/experimental/ruy/side_pair.h b/tensorflow/lite/experimental/ruy/side_pair.h new file mode 100644 index 00000000000..b20a2d1ef43 --- /dev/null +++ b/tensorflow/lite/experimental/ruy/side_pair.h @@ -0,0 +1,54 @@ +/* Copyright 2019 Google LLC. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_SIDE_PAIR_H_ +#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_SIDE_PAIR_H_ + +#include "tensorflow/lite/experimental/ruy/check_macros.h" + +namespace ruy { + +enum class Side { kLhs = 0, kRhs = 1 }; + +template +class SidePair final { + public: + SidePair() {} + SidePair(const T& a, const T& b) : elem_{a, b} {} + const T& operator[](Side side) const { + const int index = static_cast(side); + // Technically this check is vacuous, since other values would be + // out-of-range for enum Side. + RUY_DCHECK(index == 0 || index == 1); + return elem_[index]; + } + + T& operator[](Side side) { + const int index = static_cast(side); + // Technically this check is vacuous, since other values would be + // out-of-range for enum Side.
+ RUY_DCHECK(index == 0 || index == 1); + return elem_[index]; + } + + private: + static_assert(static_cast(Side::kLhs) == 0, ""); + static_assert(static_cast(Side::kRhs) == 1, ""); + T elem_[2]; +}; + +} // namespace ruy + +#endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_SIDE_PAIR_H_ diff --git a/tensorflow/lite/experimental/ruy/trace.cc b/tensorflow/lite/experimental/ruy/trace.cc index c84a59e01b4..386f5720830 100644 --- a/tensorflow/lite/experimental/ruy/trace.cc +++ b/tensorflow/lite/experimental/ruy/trace.cc @@ -35,8 +35,7 @@ struct BlockTraceEntry { std::uint32_t thread_id = 0; TimePoint time_reserved; TimePoint time_computed_coords; - TimePoint time_packed_lhs; - TimePoint time_packed_rhs; + SidePair time_packed; TimePoint time_finished; }; @@ -135,8 +134,10 @@ struct ProcessedTrace { Add(Event::kBlockReserved, entry.thread_id, i, entry.time_reserved); Add(Event::kBlockComputedCoords, entry.thread_id, i, entry.time_computed_coords); - Add(Event::kBlockPackedLhs, entry.thread_id, i, entry.time_packed_lhs); - Add(Event::kBlockPackedRhs, entry.thread_id, i, entry.time_packed_rhs); + Add(Event::kBlockPackedLhs, entry.thread_id, i, + entry.time_packed[Side::kLhs]); + Add(Event::kBlockPackedRhs, entry.thread_id, i, + entry.time_packed[Side::kRhs]); Add(Event::kBlockFinished, entry.thread_id, i, entry.time_finished); } std::sort(entries.begin(), entries.end(), @@ -307,21 +308,13 @@ void TraceRecordBlockCoordsComputed(std::uint32_t block_id, Trace* trace) { } } -void TraceRecordBlockPackedLhs(std::uint32_t block_id, Trace* trace) { +void TraceRecordBlockPacked(Side side, std::uint32_t block_id, Trace* trace) { if (trace) { RUY_DCHECK(trace->life_stage == Trace::LifeStage::kRecordingBlockAndThreadFields); TimePoint now = Clock::now(); - relaxed_atomic_store(&trace->block_entries[block_id].time_packed_lhs, now); - } -} - -void TraceRecordBlockPackedRhs(std::uint32_t block_id, Trace* trace) { - if (trace) { - RUY_DCHECK(trace->life_stage == - Trace::LifeStage::kRecordingBlockAndThreadFields); - TimePoint now = Clock::now(); - relaxed_atomic_store(&trace->block_entries[block_id].time_packed_rhs, now); + relaxed_atomic_store(&trace->block_entries[block_id].time_packed[side], + now); } } diff --git a/tensorflow/lite/experimental/ruy/trace.h b/tensorflow/lite/experimental/ruy/trace.h index ecd793dd0b8..1c405794850 100644 --- a/tensorflow/lite/experimental/ruy/trace.h +++ b/tensorflow/lite/experimental/ruy/trace.h @@ -22,6 +22,7 @@ limitations under the License. 
#include #include "tensorflow/lite/experimental/ruy/block_map.h" +#include "tensorflow/lite/experimental/ruy/side_pair.h" namespace ruy { @@ -47,8 +48,7 @@ void TraceRecordThreadLoopStart(std::uint32_t thread_id, Trace* trace); void TraceRecordBlockReserved(std::uint32_t thread_id, std::uint32_t block_id, Trace* trace); void TraceRecordBlockCoordsComputed(std::uint32_t block_id, Trace* trace); -void TraceRecordBlockPackedLhs(std::uint32_t block_id, Trace* trace); -void TraceRecordBlockPackedRhs(std::uint32_t block_id, Trace* trace); +void TraceRecordBlockPacked(Side side, std::uint32_t block_id, Trace* trace); void TraceRecordBlockFinished(std::uint32_t block_id, Trace* trace); void TraceRecordThreadEnd(std::uint32_t thread_id, Trace* trace); void TraceRecordStart(Trace* trace); @@ -66,8 +66,7 @@ inline void TraceRecordThreadStart(std::uint32_t, Trace*) {} inline void TraceRecordThreadLoopStart(std::uint32_t, Trace*) {} inline void TraceRecordBlockReserved(std::uint32_t, std::uint32_t, Trace*) {} inline void TraceRecordBlockCoordsComputed(std::uint32_t, Trace*) {} -inline void TraceRecordBlockPackedLhs(std::uint32_t, Trace*) {} -inline void TraceRecordBlockPackedRhs(std::uint32_t, Trace*) {} +inline void TraceRecordBlockPacked(Side, std::uint32_t, Trace*) {} inline void TraceRecordBlockFinished(std::uint32_t, Trace*) {} inline void TraceRecordThreadEnd(std::uint32_t, Trace*) {} inline void TraceRecordStart(Trace*) {} diff --git a/tensorflow/lite/experimental/ruy/trmul.cc b/tensorflow/lite/experimental/ruy/trmul.cc index 39f0171b838..a864cc79c04 100644 --- a/tensorflow/lite/experimental/ruy/trmul.cc +++ b/tensorflow/lite/experimental/ruy/trmul.cc @@ -22,6 +22,8 @@ limitations under the License. #include "tensorflow/lite/experimental/ruy/block_map.h" #include "tensorflow/lite/experimental/ruy/common.h" #include "tensorflow/lite/experimental/ruy/opt_set.h" +#include "tensorflow/lite/experimental/ruy/side_pair.h" +#include "tensorflow/lite/experimental/ruy/spec.h" #include "tensorflow/lite/experimental/ruy/thread_pool.h" #include "tensorflow/lite/experimental/ruy/trace.h" @@ -31,16 +33,15 @@ namespace { struct TrMulTask final : Task { TrMulTask(TrMulParams* params_, const BlockMap& block_map_, - std::atomic* atomic_n_, std::uint32_t thread_id_, - std::atomic* lhs_packed_, std::atomic* rhs_packed_, + std::atomic* atomic_block_id_, + std::uint32_t thread_id_, SidePair*> packed_, TuningResolver* tuning_resolver_, Allocator* local_allocator_, Trace* trace_) : params(params_), block_map(block_map_), - atomic_n(atomic_n_), + atomic_block_id(atomic_block_id_), thread_id(thread_id_), - lhs_packed(lhs_packed_), - rhs_packed(rhs_packed_), + packed(packed_), tuning_resolver(tuning_resolver_), local_allocator(local_allocator_), trace(trace_) {} @@ -48,81 +49,56 @@ struct TrMulTask final : Task { void Run() override { TraceRecordThreadStart(thread_id, trace); - std::uint16_t num_blocks_of_rows = NumBlocksOfRows(block_map); - std::uint16_t num_blocks_of_cols = NumBlocksOfCols(block_map); - std::uint32_t num_blocks = NumBlocks(block_map); + // Local indicators of packedness to avoid the overhead of atomic ops. 
+ SidePair local_packed{nullptr, nullptr}; - bool* local_lhs_packed = nullptr; - bool* local_rhs_packed = nullptr; + for (Side side : {Side::kLhs, Side::kRhs}) { + if (packed[side]) { + const int size = NumBlocksPerSide(side, block_map); + local_allocator->Allocate(size, &local_packed[side]); + memset(local_packed[side], 0, size * sizeof(bool)); + } + } - if (lhs_packed) { - local_allocator->Allocate(num_blocks_of_rows, &local_lhs_packed); - memset(local_lhs_packed, 0, num_blocks_of_rows * sizeof(bool)); - } - if (rhs_packed) { - local_allocator->Allocate(num_blocks_of_cols, &local_rhs_packed); - memset(local_rhs_packed, 0, num_blocks_of_cols * sizeof(bool)); - } + const std::uint32_t num_blocks = NumBlocks(block_map); const Tuning tuning = tuning_resolver->Resolve(); TraceRecordThreadLoopStart(thread_id, trace); - std::uint16_t block_r, block_c; - int start_r, start_c, end_r, end_c; + SidePair block; + SidePair start; + SidePair end; // Each thread starts by initially reserving the block whose id // is the thread id. - std::uint32_t n = thread_id; - TraceRecordBlockReserved(thread_id, n, trace); + std::uint32_t block_id = thread_id; + TraceRecordBlockReserved(thread_id, block_id, trace); - while (n < num_blocks) { + while (block_id < num_blocks) { // Reserve the next block to handle. In order to hide the latency // (typically comparable to an access to the level of data cache that // is shared among CPU cores, e.g. 60 cycles on an ARM CPU as of 2019) // of this atomic operation, we structure this code so as to avoid // immediately depending on the `next_n` result. - const std::uint32_t next_n = - atomic_n->fetch_add(1, std::memory_order_relaxed); - TraceRecordBlockReserved(thread_id, next_n, trace); + const std::uint32_t next_block_id = + atomic_block_id->fetch_add(1, std::memory_order_relaxed); + TraceRecordBlockReserved(thread_id, next_block_id, trace); // Get coordinates of the current block to handle, in "block space". - GetBlockByIndex(block_map, n, &block_r, &block_c); + GetBlockByIndex(block_map, block_id, &block); // Get coordinates of the current block to handle, in matrix space. - GetBlockMatrixCoords(block_map, block_r, block_c, &start_r, &start_c, - &end_r, &end_c); - TraceRecordBlockCoordsComputed(n, trace); - // Maybe pack the current LHS block, if not already packed. - // Note that if two threads concurrently hit the same LHS block to pack, - // we allow them to concurrently pack it, writing the same packed matrix - // data to the same location. That is considered worth it to avoid - // having one thread blocked on another one. Avoiding that is considered - // important especially on mobile, where there can be large speed - // discrepancy between threads, e.g. if different threads are scheduled - // on CPU cores of different types (big/little), different clock speed, - // different contention with other processes. - if (local_lhs_packed && !local_lhs_packed[block_r]) { - if (!lhs_packed[block_r].load(std::memory_order_acquire)) { - params->LhsRunPack(tuning, start_r, end_r); - TraceRecordBlockPackedLhs(n, trace); - local_lhs_packed[block_r] = true; - lhs_packed[block_r].store(true, std::memory_order_release); - } - } - // Maybe pack the current RHS block. Same comments as above for LHS. 
- if (local_rhs_packed && !local_rhs_packed[block_c]) { - if (!rhs_packed[block_c].load(std::memory_order_acquire)) { - params->RhsRunPack(tuning, start_c, end_c); - TraceRecordBlockPackedRhs(n, trace); - local_rhs_packed[block_c] = true; - rhs_packed[block_c].store(true, std::memory_order_release); - } + GetBlockMatrixCoords(block_map, block, &start, &end); + TraceRecordBlockCoordsComputed(block_id, trace); + // Maybe pack the current LHS/RHS block, if not already packed. + for (Side side : {Side::kLhs, Side::kRhs}) { + EnsurePacked(side, block_id, local_packed, block, start, end, tuning); } // Actually do matrix multiplication work - params->RunKernel(tuning, start_r, start_c, end_r, end_c); - TraceRecordBlockFinished(n, trace); + params->RunKernel(tuning, start, end); + TraceRecordBlockFinished(block_id, trace); // Move on to the next block as obtained by the atomic increment // at the start of this while loop iteration. - n = next_n; + block_id = next_block_id; } local_allocator->FreeAll(); @@ -131,12 +107,34 @@ struct TrMulTask final : Task { } private: + void EnsurePacked(Side side, std::uint32_t block_id, + const SidePair local_packed, + const SidePair& block, + const SidePair& start, const SidePair& end, + Tuning tuning) { + // If two threads concurrently hit the same block to pack, + // we allow them to concurrently pack it, writing the same packed matrix + // data to the same location. That is considered worth it to avoid + // having one thread blocked on another one. Avoiding that is considered + // important especially on mobile, where there can be large speed + // discrepancy between threads, e.g. if different threads are scheduled + // on CPU cores of different types (big/little), different clock speed, + // different contention with other processes. 
+ if (local_packed[side] && !local_packed[side][block[side]]) { + if (!packed[side][block[side]].load(std::memory_order_acquire)) { + params->RunPack(side, tuning, start, end); + TraceRecordBlockPacked(side, block_id, trace); + local_packed[side][block[side]] = true; + packed[side][block[side]].store(true, std::memory_order_release); + } + } + } + TrMulParams* params; const BlockMap& block_map; - std::atomic* atomic_n; + std::atomic* atomic_block_id; std::uint32_t thread_id; - std::atomic* lhs_packed; - std::atomic* rhs_packed; + SidePair*> packed; TuningResolver* tuning_resolver; Allocator* local_allocator; Trace* trace; @@ -169,16 +167,14 @@ LoopStructure GetLoopStructure(int thread_count, int rows, int cols, int depth, void TrMul(TrMulParams* params, Context* context) { gemmlowp::ScopedProfilingLabel label("TrMul"); - PMatrix& packed_lhs = params->packed_lhs; - PMatrix& packed_rhs = params->packed_rhs; - DMatrix& lhs = params->lhs; - DMatrix& rhs = params->rhs; + PMatrix& packed_lhs = params->packed[Side::kLhs]; + PMatrix& packed_rhs = params->packed[Side::kRhs]; + DMatrix& lhs = params->src[Side::kLhs]; + DMatrix& rhs = params->src[Side::kRhs]; const int rows = lhs.layout.cols; const int cols = rhs.layout.cols; const int depth = lhs.layout.rows; - const int rows_rounded_up = packed_lhs.layout.cols; - const int cols_rounded_up = packed_rhs.layout.cols; int thread_count = GetThreadCount(context, rows, cols, depth); const auto loop_structure = @@ -186,24 +182,30 @@ void TrMul(TrMulParams* params, Context* context) { params->cache_friendly_traversal_threshold); Allocator* allocator = context->GetMainAllocator(); - if (!params->lhs_is_prepacked) { - AllocatePMatrix(allocator, &packed_lhs); - } - if (!params->rhs_is_prepacked) { - AllocatePMatrix(allocator, &packed_rhs); + // Allocate packed matrices + for (Side side : {Side::kLhs, Side::kRhs}) { + if (!params->is_prepacked[side]) { + AllocatePMatrix(allocator, ¶ms->packed[side]); + } } + // Case of running this TrMul as a simple loop. + // This is a good place to start reading this function: all the rest + // of this function is just an optimized, but functionally equivalent, + // version of that. if (loop_structure == LoopStructure::kSimple) { gemmlowp::ScopedProfilingLabel label_simple("TrMulImpl, simple loop"); Tuning tuning = context->GetMainThreadTuning(); - if (!params->lhs_is_prepacked) { - params->LhsRunPack(tuning, 0, rows_rounded_up); + const SidePair origin{0, 0}; + const SidePair rounded_dims{packed_lhs.layout.cols, + packed_rhs.layout.cols}; + for (Side side : {Side::kLhs, Side::kRhs}) { + if (!params->is_prepacked[side]) { + params->RunPack(side, tuning, origin, rounded_dims); + } } - if (!params->rhs_is_prepacked) { - params->RhsRunPack(tuning, 0, cols_rounded_up); - } - params->RunKernel(tuning, 0, 0, rows_rounded_up, cols_rounded_up); + params->RunKernel(tuning, origin, rounded_dims); allocator->FreeAll(); return; @@ -216,54 +218,46 @@ void TrMul(TrMulParams* params, Context* context) { // Initialize block map. 
BlockMap block_map; - MakeBlockMap(rows_rounded_up, cols_rounded_up, depth, + MakeBlockMap(packed_lhs.layout.cols, packed_rhs.layout.cols, depth, packed_lhs.layout.kernel.cols, packed_rhs.layout.kernel.cols, packed_lhs.data_type.size, packed_rhs.data_type.size, params->cache_friendly_traversal_threshold, &block_map); - std::uint16_t num_blocks_of_rows = NumBlocksOfRows(block_map); - std::uint16_t num_blocks_of_cols = NumBlocksOfCols(block_map); - std::uint32_t num_blocks = NumBlocks(block_map); - RUY_DCHECK_EQ(num_blocks, num_blocks_of_rows * num_blocks_of_cols); // Initialize per-thread state. - thread_count = clamp(thread_count, 1, num_blocks); + thread_count = clamp(thread_count, 1, NumBlocks(block_map)); context->EnsureNPerThreadStates(thread_count); for (auto& per_thread_state : context->per_thread_states) { per_thread_state->tuning_resolver.SetTuning(context->explicit_tuning); } - // Allocate memory. - std::atomic* lhs_packed = nullptr; - if (!params->lhs_is_prepacked) { - allocator->Allocate(num_blocks_of_rows, &lhs_packed); + // Allocate and initialize atomic values tracking already-packed blocks. + SidePair*> packed{nullptr, nullptr}; + for (Side side : {Side::kLhs, Side::kRhs}) { + if (!params->is_prepacked[side]) { + const int size = NumBlocksPerSide(side, block_map); + allocator->Allocate(size, &packed[side]); + for (int i = 0; i < size; i++) { + packed[side][i].store(false, std::memory_order_release); + } + } } - std::atomic* rhs_packed = nullptr; - if (!params->rhs_is_prepacked) { - allocator->Allocate(num_blocks_of_cols, &rhs_packed); - } - std::atomic* atomic_n; - allocator->Allocate(1, &atomic_n); + + // Create the atomic block id, allocate it using Allocator so that + // we get the alignment ensuring that it sits alone in its exclusives + // reservation granule. + std::atomic* atomic_block_id; + allocator->Allocate(1, &atomic_block_id); + + // Create task objects. TrMulTask* tasks; allocator->Allocate(thread_count, &tasks); - // Initialize allocated data. - if (lhs_packed != nullptr) { - for (int i = 0; i < num_blocks_of_rows; i++) { - lhs_packed[i].store(false, std::memory_order_release); - } - } - if (rhs_packed != nullptr) { - for (int i = 0; i < num_blocks_of_cols; i++) { - rhs_packed[i].store(false, std::memory_order_release); - } - } - atomic_n->store(thread_count); + atomic_block_id->store(thread_count); for (int i = 0; i < thread_count; i++) { - new (tasks + i) - TrMulTask(params, block_map, atomic_n, i, lhs_packed, rhs_packed, - &context->per_thread_states[i]->tuning_resolver, - &context->per_thread_states[i]->allocator, trace); + new (tasks + i) TrMulTask(params, block_map, atomic_block_id, i, packed, + &context->per_thread_states[i]->tuning_resolver, + &context->per_thread_states[i]->allocator, trace); } // Do the computation. diff --git a/tensorflow/lite/experimental/ruy/trmul.h b/tensorflow/lite/experimental/ruy/trmul.h index 1a3872bc2ba..6f7d7ba4590 100644 --- a/tensorflow/lite/experimental/ruy/trmul.h +++ b/tensorflow/lite/experimental/ruy/trmul.h @@ -27,47 +27,10 @@ limitations under the License. #define TENSORFLOW_LITE_EXPERIMENTAL_RUY_TRMUL_H_ #include "tensorflow/lite/experimental/ruy/context.h" -#include "tensorflow/lite/experimental/ruy/internal_matrix.h" -#include "tensorflow/lite/experimental/ruy/kernel.h" -#include "tensorflow/lite/experimental/ruy/pack.h" -#include "tensorflow/lite/experimental/ruy/tune.h" +#include "tensorflow/lite/experimental/ruy/trmul_params.h" namespace ruy { -// Type-erased data needed for implementing TrMul. 
-struct TrMulParams { - // Helper functions for invoking the function pointers. - void LhsRunPack(Tuning tuning, int start_c, int end_c) { - lhs_run_pack(tuning, lhs, &packed_lhs, start_c, end_c); - } - void RhsRunPack(Tuning tuning, int start_c, int end_c) { - rhs_run_pack(tuning, rhs, &packed_rhs, start_c, end_c); - } - void RunKernel(Tuning tuning, int start_r, int start_c, int end_r, - int end_c) { - run_kernel(tuning, packed_lhs, packed_rhs, spec, start_r, start_c, end_r, - end_c, &dst); - } - - // Function pointers to type-erased entry points for kernels and packers. - RunPackFn* lhs_run_pack = nullptr; - RunPackFn* rhs_run_pack = nullptr; - RunKernelFn* run_kernel = nullptr; - - // Matrices and packed matrices. - DMatrix lhs; - DMatrix rhs; - DMatrix dst; - PMatrix packed_lhs; - PMatrix packed_rhs; - bool lhs_is_prepacked = false; - bool rhs_is_prepacked = false; - int cache_friendly_traversal_threshold = 0; - - // Type-erased Spec. - void* spec = nullptr; -}; - void TrMul(TrMulParams* params, Context* context); } // namespace ruy diff --git a/tensorflow/lite/experimental/ruy/trmul_params.h b/tensorflow/lite/experimental/ruy/trmul_params.h new file mode 100644 index 00000000000..49e60dacf66 --- /dev/null +++ b/tensorflow/lite/experimental/ruy/trmul_params.h @@ -0,0 +1,60 @@ +/* Copyright 2019 Google LLC. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_TRMUL_PARAMS_H_ +#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_TRMUL_PARAMS_H_ + +#include "tensorflow/lite/experimental/ruy/internal_matrix.h" +#include "tensorflow/lite/experimental/ruy/side_pair.h" +#include "tensorflow/lite/experimental/ruy/tune.h" + +namespace ruy { + +using RunKernelFn = void(Tuning, const SidePair&, void*, + const SidePair&, const SidePair&, DMatrix*); + +using RunPackFn = void(Tuning, const DMatrix&, PMatrix*, int, int); + +// Type-erased data needed for implementing TrMul. +struct TrMulParams { + TrMulParams() : run_pack{nullptr, nullptr}, is_prepacked{false, false} {} + // Helper functions for invoking the function pointers. + void RunPack(Side side, Tuning tuning, const SidePair& start, + const SidePair& end) { + run_pack[side](tuning, src[side], &packed[side], start[side], end[side]); + } + void RunKernel(Tuning tuning, const SidePair& start, + const SidePair& end) { + run_kernel(tuning, packed, spec, start, end, &dst); + } + + // Function pointers to type-erased entry points for kernels and packers. + SidePair run_pack; + RunKernelFn* run_kernel = nullptr; + + // Matrices and packed matrices. + SidePair src; + DMatrix dst; + SidePair packed; + SidePair is_prepacked; + int cache_friendly_traversal_threshold = 0; + + // Type-erased Spec. + void* spec = nullptr; +}; + +} // namespace ruy + +#endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_TRMUL_PARAMS_H_ From f6b130616b05a2fe710372185ff85add523ee0fd Mon Sep 17 00:00:00 2001 From: "Joshua V. 
Dillon" Date: Thu, 25 Jul 2019 11:04:36 -0700 Subject: [PATCH 0581/3053] Ensure `run_all_in_graph_and_eager_modes` works in derived test classes. Currently `run_all_in_graph_and_eager_modes` uses `__dict__` which is an unreliable list of members. This change uses `dir` which is part of "Public Python". A consequence of this change is that some TensorFlow Probability (eager-mode only) tests must be temporarily disabled until they are fixed. PiperOrigin-RevId: 259981722 --- tensorflow/python/framework/test_util.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index 4eaae126cef..2834d11d692 100644 --- a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -915,11 +915,14 @@ def generate_combinations_with_testcase_name(**kwargs): def run_all_in_graph_and_eager_modes(cls): """Execute all test methods in the given class with and without eager.""" base_decorator = run_in_graph_and_eager_modes - for name, value in cls.__dict__.copy().items(): - if callable(value) and name.startswith( - unittest.TestLoader.testMethodPrefix) and not ( - name.startswith("testSkipEager") or - name.startswith("test_skip_eager") or name == "test_session"): + for name in dir(cls): + if (not name.startswith(unittest.TestLoader.testMethodPrefix) or + name.startswith("testSkipEager") or + name.startswith("test_skip_eager") or + name == "test_session"): + continue + value = getattr(cls, name, None) + if callable(value): setattr(cls, name, base_decorator(value)) return cls From fcff61f085bd0984430800c446c2e56c39241e1e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 25 Jul 2019 11:06:13 -0700 Subject: [PATCH 0582/3053] Create a C++ string-ngrams op. PiperOrigin-RevId: 259982106 --- .../base_api/api_def_StringNGrams.pbtxt | 69 +++ .../python_api/api_def_StringNGrams.pbtxt | 4 + tensorflow/core/kernels/BUILD | 25 + tensorflow/core/kernels/string_ngrams_op.cc | 201 +++++++ .../core/kernels/string_ngrams_op_test.cc | 554 ++++++++++++++++++ tensorflow/core/ops/string_ops.cc | 22 + tensorflow/python/ops/ragged/BUILD | 12 + .../python/ops/ragged/ragged_string_ops.py | 137 +++++ .../ops/ragged/string_ngrams_op_test.py | 250 ++++++++ .../api/golden/v1/tensorflow.raw_ops.pbtxt | 4 + .../api/golden/v1/tensorflow.strings.pbtxt | 4 + .../api/golden/v2/tensorflow.raw_ops.pbtxt | 4 + .../api/golden/v2/tensorflow.strings.pbtxt | 4 + 13 files changed, 1290 insertions(+) create mode 100644 tensorflow/core/api_def/base_api/api_def_StringNGrams.pbtxt create mode 100644 tensorflow/core/api_def/python_api/api_def_StringNGrams.pbtxt create mode 100644 tensorflow/core/kernels/string_ngrams_op.cc create mode 100644 tensorflow/core/kernels/string_ngrams_op_test.cc create mode 100644 tensorflow/python/ops/ragged/string_ngrams_op_test.py diff --git a/tensorflow/core/api_def/base_api/api_def_StringNGrams.pbtxt b/tensorflow/core/api_def/base_api/api_def_StringNGrams.pbtxt new file mode 100644 index 00000000000..d3d1a01ed37 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_StringNGrams.pbtxt @@ -0,0 +1,69 @@ +op { + graph_op_name: "StringNGrams" + in_arg { + name: "data" + description: <